189 files changed, 66546 insertions, 0 deletions
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
new file mode 100644
index 00000000..5a0401fa
--- /dev/null
+++ b/storage/innobase/include/btr0btr.h
@@ -0,0 +1,543 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2014, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0btr.h
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "dict0dict.h"
+#include "data0data.h"
+#include "rem0types.h"
+#include "page0cur.h"
+#include "btr0types.h"
+#include "gis0type.h"
+
+#define BTR_MAX_NODE_LEVEL	50	/*!< Maximum B-tree page level
+					(not really a hard limit).
+					Used in debug assertions
+					in btr_page_set_level and
+					btr_page_get_level */
+
+/** Maximum record size which can be stored on a page, without using the
+special big record storage structure */
+#define	BTR_PAGE_MAX_REC_SIZE	(srv_page_size / 2 - 200)
+
+/** @brief Maximum depth of a B-tree in InnoDB.
+
+Note that this isn't a maximum as such; none of the tree operations
+avoid producing trees bigger than this. It is instead a "max depth
+that other code must work with", useful for e.g.  fixed-size arrays
+that must store some information about each level in a tree. In other
+words: if a B-tree with bigger depth than this is encountered, it is
+not acceptable for it to lead to mysterious memory corruption, but it
+is acceptable for the program to die with a clear assert failure. */
+#define BTR_MAX_LEVELS		100
+
+/** Clear every modifier flag listed in the mask below from latch_mode
+and convert the remaining bits to btr_latch_mode.
+@param latch_mode  latch mode, possibly combined with the flags below */
+#define BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode)		\
+	btr_latch_mode((latch_mode) & ~(BTR_INSERT	\
+				| BTR_DELETE_MARK		\
+				| BTR_RTREE_UNDO_INS		\
+				| BTR_RTREE_DELETE_MARK		\
+				| BTR_DELETE			\
+				| BTR_IGNORE_SEC_UNIQUE		\
+				| BTR_ALREADY_S_LATCHED		\
+				| BTR_LATCH_FOR_INSERT		\
+				| BTR_LATCH_FOR_DELETE))
+
+/** Clear the BTR_LATCH_FOR_INSERT and BTR_LATCH_FOR_DELETE bits from
+latch_mode and convert the result to btr_latch_mode.
+@param latch_mode  latch mode, possibly combined with those two bits */
+#define BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode)			\
+	btr_latch_mode((latch_mode)					\
+		       & ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE))
+
+/**************************************************************//**
+Checks and adjusts the root node of a tree during IMPORT TABLESPACE.
+@return error code, or DB_SUCCESS */
+dberr_t
+btr_root_adjust_on_import(
+/*======================*/
+	const dict_index_t*	index)	/*!< in: index tree */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Report a decryption failure. */
+ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index);
+
+/** Get an index page and declare its latching order level.
+@param[in]	index	index tree
+@param[in]	page	page number
+@param[in]	mode	latch mode
+@param[in]	merge	whether change buffer merge should be attempted
+@param[in,out]	mtr	mini-transaction
+@param[out]	err	error code
+@return block */
+buf_block_t *btr_block_get(const dict_index_t &index,
+                           uint32_t page, rw_lock_type_t mode, bool merge,
+                           mtr_t *mtr, dberr_t *err= nullptr);
+
+/**************************************************************//**
+Gets the index id field of a page.
+@return index id */
+UNIV_INLINE
+index_id_t
+btr_page_get_index_id(
+/*==================*/
+	const page_t*	page)	/*!< in: index page */
+	MY_ATTRIBUTE((warn_unused_result));
+/** Read the B-tree or R-tree PAGE_LEVEL.
+@param page B-tree or R-tree page
+@return number of child page links to reach the leaf level
+@retval 0 for leaf pages */
+MY_ATTRIBUTE((warn_unused_result))
+inline uint16_t btr_page_get_level(const page_t *page)
+{
+  /* PAGE_LEVEL is read as a 2-byte field of the page header. */
+  uint16_t level= mach_read_from_2(my_assume_aligned<2>
+                                   (PAGE_HEADER + PAGE_LEVEL + page));
+  /* Debug assertion only; BTR_MAX_NODE_LEVEL is not a hard limit. */
+  ut_ad(level <= BTR_MAX_NODE_LEVEL);
+  return level;
+}
+
+/** Read FIL_PAGE_NEXT.
+@param page  buffer pool page
+@return next page number */
+inline uint32_t btr_page_get_next(const page_t* page)
+{
+  return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));
+}
+
+/** Fetch the page number stored in the FIL_PAGE_PREV field.
+@param page  buffer pool page
+@return previous page number */
+inline uint32_t btr_page_get_prev(const page_t* page)
+{
+  const page_t *prev_field= my_assume_aligned<4>(page + FIL_PAGE_PREV);
+  return mach_read_from_4(prev_field);
+}
+
+/**************************************************************//**
+Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
+@return child node address */
+UNIV_INLINE
+uint32_t
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+	const rec_t*	rec,	/*!< in: node pointer record */
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Create the root node for a new index tree.
+@param[in]	type			type of the index
+@param[in,out]	space			tablespace where created
+@param[in]	index_id		index id
+@param[in]	index			index, or NULL to create a system table
+@param[in,out]	mtr			mini-transaction
+@param[out]	err			error code
+@return	page number of the created root
+@retval	FIL_NULL	if did not succeed */
+uint32_t
+btr_create(
+	ulint			type,
+	fil_space_t*		space,
+	index_id_t		index_id,
+	dict_index_t*		index,
+	mtr_t*			mtr,
+	dberr_t*		err)
+	MY_ATTRIBUTE((nonnull(2,5,6), warn_unused_result));
+
+/** Free a persistent index tree if it exists.
+@param[in,out]	space		tablespace
+@param[in]	page		root page number
+@param[in]	index_id	PAGE_INDEX_ID contents
+@param[in,out]	mtr		mini-transaction */
+void btr_free_if_exists(fil_space_t *space, uint32_t page,
+                        index_id_t index_id, mtr_t *mtr);
+
+/** Drop a temporary table
+@param table   temporary table */
+void btr_drop_temporary_table(const dict_table_t &table);
+
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC.
+@param[in,out]	index	clustered index
+@return	the last used AUTO_INCREMENT value
+@retval	0 on error or if no AUTO_INCREMENT value was used yet */
+ib_uint64_t
+btr_read_autoinc(dict_index_t* index)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC,
+or fall back to MAX(auto_increment_column).
+@param[in]	table	table containing an AUTO_INCREMENT column
+@param[in]	col_no	index of the AUTO_INCREMENT column
+@return	the AUTO_INCREMENT value
+@retval	0 on error or if no AUTO_INCREMENT value was used yet */
+ib_uint64_t
+btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC.
+@param[in,out]	index	clustered index
+@param[in]	autoinc	the AUTO_INCREMENT value
+@param[in]	reset	whether to reset the AUTO_INCREMENT
+			to a possibly smaller value than currently
+			exists in the page */
+void
+btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset = false)
+	MY_ATTRIBUTE((nonnull));
+
+/** Write instant ALTER TABLE metadata to a root page.
+@param[in,out]	root	clustered index root page
+@param[in]	index	clustered index with instant ALTER TABLE
+@param[in,out]	mtr	mini-transaction */
+void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr);
+
+ATTRIBUTE_COLD __attribute__((nonnull))
+/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE.
+@param[in]      index   clustered index with instant ALTER TABLE
+@param[in]      all     whether to reset FIL_PAGE_TYPE as well
+@param[in,out]  mtr     mini-transaction */
+void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr);
+
+/*************************************************************//**
+Makes tree one level higher by splitting the root, and inserts
+the tuple. It is assumed that mtr contains an x-latch on the tree.
+NOTE that the operation of this function must always succeed,
+we cannot reverse it: therefore enough free disk space must be
+guaranteed to be available before this function is called.
+@return inserted record */
+rec_t*
+btr_root_raise_and_insert(
+/*======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	btr_cur_t*	cursor,	/*!< in: cursor at which to insert: must be
+				on the root page; when the function returns,
+				the cursor is positioned on the predecessor
+				of the inserted record */
+	rec_offs**	offsets,/*!< out: offsets on inserted record */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap
+				that can be emptied, or NULL */
+	const dtuple_t*	tuple,	/*!< in: tuple to insert */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	mtr_t*		mtr,	/*!< in: mtr */
+	dberr_t*	err)	/*!< out: error code */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@param cursor  page cursor
+@param mtr     mini-transaction
+@return error code
+@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
+dberr_t btr_page_reorganize(page_cur_t *cursor, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Decide if the page should be split at the convergence point of inserts
+converging to the left.
+@param[in]	cursor	insert position
+@return the first record to be moved to the right half page
+@retval	NULL if no split is recommended */
+rec_t* btr_page_get_split_rec_to_left(const btr_cur_t* cursor);
+/** Decide if the page should be split at the convergence point of inserts
+converging to the right.
+@param[in]	cursor		insert position
+@param[out]	split_rec	if split recommended, the first record
+				on the right half page, or
+				NULL if the to-be-inserted record
+				should be first
+@return whether split is recommended */
+bool
+btr_page_get_split_rec_to_right(const btr_cur_t* cursor, rec_t** split_rec);
+
+/*************************************************************//**
+Splits an index page to halves and inserts the tuple. It is assumed
+that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
+released within this function! NOTE that the operation of this
+function must always succeed, we cannot reverse it: therefore enough
+free disk space (2 pages) must be guaranteed to be available before
+this function is called.
+
+@return inserted record */
+rec_t*
+btr_page_split_and_insert(
+/*======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	btr_cur_t*	cursor,	/*!< in: cursor at which to insert; when the
+				function returns, the cursor is positioned
+				on the predecessor of the inserted record */
+	rec_offs**	offsets,/*!< out: offsets on inserted record */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap
+				that can be emptied, or NULL */
+	const dtuple_t*	tuple,	/*!< in: tuple to insert */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	mtr_t*		mtr,	/*!< in: mtr */
+	dberr_t*	err)	/*!< out: error code */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*******************************************************//**
+Inserts a data tuple to a tree on a non-leaf level. It is assumed
+that mtr holds an x-latch on the tree.
+@return error code */
+dberr_t
+btr_insert_on_non_leaf_level(
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	dict_index_t*	index,	/*!< in: index */
+	ulint		level,	/*!< in: level, must be > 0 */
+	dtuple_t*	tuple,	/*!< in: the record to be inserted */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Flag a child page pointer record as the predefined minimum record
+by setting REC_INFO_MIN_REC_FLAG in its info bits.
+@tparam has_prev  whether the page is supposed to have a left sibling
+@param[in,out]  rec     leftmost record on a leftmost non-leaf page
+@param[in,out]  block   buffer pool block
+@param[in,out]  mtr     mini-transaction */
+template<bool has_prev= false>
+inline void btr_set_min_rec_mark(rec_t *rec, const buf_block_t &block,
+                                 mtr_t *mtr)
+{
+  ut_ad(block.page.frame == page_align(rec));
+  ut_ad(!page_is_leaf(block.page.frame));
+  ut_ad(has_prev == page_has_prev(block.page.frame));
+
+  /* Step back from the record to its info bits; the distance depends
+  on whether page_rec_is_comp() holds for the record. */
+  if (page_rec_is_comp(rec))
+    rec-= REC_NEW_INFO_BITS;
+  else
+    rec-= REC_OLD_INFO_BITS;
+
+  if (!block.page.zip.data)
+  {
+    mtr->write<1>(block, rec, *rec | REC_INFO_MIN_REC_FLAG);
+    return;
+  }
+
+  /* This flag is computed from other contents on a ROW_FORMAT=COMPRESSED
+  page. We are not modifying the compressed page frame at all. */
+  *rec|= REC_INFO_MIN_REC_FLAG;
+}
+
+/** Seek to the parent page of a B-tree page.
+@param[in,out]	mtr	mini-transaction
+@param[in,out]	cursor	cursor pointing to the x-latched parent page
+@return whether the cursor was successfully positioned */
+bool btr_page_get_father(mtr_t* mtr, btr_cur_t* cursor)
+	MY_ATTRIBUTE((nonnull,warn_unused_result));
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Checks that the node pointer to a page is appropriate.
+@return TRUE */
+ibool
+btr_check_node_ptr(
+/*===============*/
+	dict_index_t*	index,	/*!< in: index tree */
+	buf_block_t*	block,	/*!< in: index page */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((warn_unused_result));
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Tries to merge the page first to the left immediate brother if such a
+brother exists, and the node pointers to the current page and to the
+brother reside on the same page. If the left brother does not satisfy these
+conditions, looks at the right brother. If the page is the only one on that
+level lifts the records of the page to the father page, thus reducing the
+tree height. It is assumed that mtr holds an x-latch on the tree and on the
+page. If cursor is on the leaf level, mtr must also hold x-latches to
+the brothers, if they exist.
+@return error code
+@retval DB_FAIL if the tree could not be merged */
+dberr_t
+btr_compress(
+/*=========*/
+	btr_cur_t*	cursor,	/*!< in/out: cursor on the page to merge
+				or lift; the page must not be empty:
+				when deleting records, use btr_discard_page()
+				if the page would become empty */
+	bool		adjust,	/*!< in: whether the cursor position should be
+				adjusted even when compression occurs */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Discards a page from a B-tree. This is used to remove the last record from
+a B-tree page: the whole page must be removed at the same time. This cannot
+be used for the root page, which is allowed to be empty.
+@return error code */
+dberr_t
+btr_discard_page(
+/*=============*/
+	btr_cur_t*	cursor,	/*!< in: cursor on the page to discard: not on
+				the root page */
+	mtr_t*		mtr);	/*!< in: mtr */
+
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents!
+@retval NULL if no page could be allocated */
+buf_block_t*
+btr_page_alloc(
+/*===========*/
+	dict_index_t*	index,		/*!< in: index tree */
+	uint32_t	hint_page_no,	/*!< in: hint of a good page */
+	byte		file_direction,	/*!< in: direction where a possible
+					page split is made */
+	ulint		level,		/*!< in: level where the page is placed
+					in the tree */
+	mtr_t*		mtr,		/*!< in/out: mini-transaction
+					for the allocation */
+	mtr_t*		init_mtr,	/*!< in/out: mini-transaction
+					for x-latching and initializing
+					the page */
+	dberr_t*	err)		/*!< out: error code */
+	MY_ATTRIBUTE((warn_unused_result));
+/** Empty an index page (possibly the root page). @see btr_page_create().
+@param[in,out]	block		page to be emptied
+@param[in,out]	page_zip	compressed page frame, or NULL
+@param[in]	index		index of the page
+@param[in]	level		B-tree level of the page (0=leaf)
+@param[in,out]	mtr		mini-transaction */
+void
+btr_page_empty(
+	buf_block_t*	block,
+	page_zip_des_t*	page_zip,
+	dict_index_t*	index,
+	ulint		level,
+	mtr_t*		mtr)
+	MY_ATTRIBUTE((nonnull(1, 3, 5)));
+/**************************************************************//**
+Creates a new index page (not the root, and also not
+used in page reorganization).  @see btr_page_empty(). */
+void
+btr_page_create(
+/*============*/
+	buf_block_t*	block,	/*!< in/out: page to be created */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	dict_index_t*	index,	/*!< in: index */
+	ulint		level,	/*!< in: the B-tree level of the page */
+	mtr_t*		mtr);	/*!< in: mtr */
+
+/** Free an index page.
+@param[in,out]	index	index tree
+@param[in,out]	block	block to be freed
+@param[in,out]	mtr	mini-transaction
+@param[in]	blob	whether this is freeing a BLOB page
+@param[in]	space_latched	whether index->table->space->x_lock() was called
+@return error code */
+MY_ATTRIBUTE((nonnull))
+dberr_t btr_page_free(dict_index_t *index, buf_block_t *block, mtr_t *mtr,
+                      bool blob= false, bool space_latched= false);
+
+/**************************************************************//**
+Gets the root node of a tree and x- or s-latches it.
+@return root page, x- or s-latched */
+buf_block_t*
+btr_root_block_get(
+/*===============*/
+	dict_index_t*		index,	/*!< in: index tree */
+	rw_lock_type_t		mode,	/*!< in: either RW_S_LATCH
+					or RW_X_LATCH */
+	mtr_t*			mtr,	/*!< in: mtr */
+	dberr_t*		err);	/*!< out: error code */
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@return error code
+@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
+dberr_t btr_page_reorganize_block(
+	ulint		z_level,/*!< in: compression level to be used
+				if dealing with compressed page */
+	buf_block_t*	block,	/*!< in/out: B-tree page */
+	dict_index_t*	index,	/*!< in: the index tree of the page */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	__attribute__((nonnull, warn_unused_result));
+
+#ifdef UNIV_BTR_PRINT
+/*************************************************************//**
+Prints size info of a B-tree. */
+void
+btr_print_size(
+/*===========*/
+	dict_index_t*	index)	/*!< in: index tree */
+	MY_ATTRIBUTE((nonnull));
+/**************************************************************//**
+Prints directories and other info of all nodes in the index. */
+void
+btr_print_index(
+/*============*/
+	dict_index_t*	index,	/*!< in: index */
+	ulint		width)	/*!< in: print this many entries from start
+				and end */
+	MY_ATTRIBUTE((nonnull));
+#endif /* UNIV_BTR_PRINT */
+/************************************************************//**
+Checks the size and number of fields in a record based on the definition of
+the index.
+@return TRUE if ok */
+ibool
+btr_index_rec_validate(
+/*===================*/
+	const rec_t*		rec,		/*!< in: index record */
+	const dict_index_t*	index,		/*!< in: index */
+	ibool			dump_on_error)	/*!< in: TRUE if the function
+						should print hex dump of record
+						and page on error */
+	MY_ATTRIBUTE((warn_unused_result));
+/**************************************************************//**
+Checks the consistency of an index tree.
+@return	DB_SUCCESS if ok, error code if not */
+dberr_t
+btr_validate_index(
+/*===============*/
+	dict_index_t*	index,	/*!< in: index */
+	const trx_t*	trx)	/*!< in: transaction or 0 */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Remove a page from the level list of pages.
+@param[in]	block		page to remove
+@param[in]	index		index tree
+@param[in,out]	mtr		mini-transaction
+@return error code */
+dberr_t btr_level_list_remove(const buf_block_t& block,
+                              const dict_index_t& index, mtr_t* mtr)
+  MY_ATTRIBUTE((warn_unused_result));
+
+/*************************************************************//**
+If page is the only on its level, this function moves its records to the
+father page, thus reducing the tree height.
+@return father block */
+buf_block_t*
+btr_lift_page_up(
+	dict_index_t*	index,	/*!< in: index tree */
+	buf_block_t*	block,	/*!< in: page which is the only on its level;
+				must not be empty: use
+				btr_discard_only_page_on_level if the last
+				record from the page should be removed */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	dberr_t*	err)	/*!< out: error code */
+	__attribute__((nonnull));
+
+#define BTR_N_LEAF_PAGES	1
+#define BTR_TOTAL_SIZE		2
+
+#include "btr0btr.inl"
+
+/****************************************************************
+Global variable controlling if scrubbing should be performed */
+extern my_bool srv_immediate_scrub_data_uncompressed;
diff --git a/storage/innobase/include/btr0btr.inl b/storage/innobase/include/btr0btr.inl
new file mode 100644
index 00000000..9a9e39b6
--- /dev/null
+++ b/storage/innobase/include/btr0btr.inl
@@ -0,0 +1,111 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0btr.inl
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+
+/**************************************************************//**
+Reads the PAGE_INDEX_ID field from the header of an index page.
+@return index id */
+UNIV_INLINE
+index_id_t
+btr_page_get_index_id(
+/*==================*/
+	const page_t*	page)	/*!< in: index page */
+{
+	const page_t*	id_field = page + PAGE_HEADER + PAGE_INDEX_ID;
+
+	return(mach_read_from_8(id_field));
+}
+
+/** Store the page level in the PAGE_LEVEL field of the page header.
+@param[in,out]  block  buffer block
+@param[in]      level  page level
+@param[in,out]  mtr    mini-transaction */
+inline
+void btr_page_set_level(buf_block_t *block, ulint level, mtr_t *mtr)
+{
+  ut_ad(level <= BTR_MAX_NODE_LEVEL);
+
+  constexpr uint16_t offset= PAGE_HEADER + PAGE_LEVEL;
+  byte *dest= my_assume_aligned<2>(&block->page.frame[offset]);
+
+  if (!mtr->write<2,mtr_t::MAYBE_NOP>(*block, dest, level))
+    return;
+
+  /* The write returned true: mirror the 2 bytes into the compressed
+  page frame, if the block has one. */
+  if (UNIV_LIKELY_NULL(block->page.zip.data))
+    memcpy_aligned<2>(&block->page.zip.data[offset], dest, 2);
+}
+
+/** Store the successor page number in the FIL_PAGE_NEXT field.
+@param[in,out]  block  buffer block
+@param[in]      next   number of successor page
+@param[in,out]  mtr    mini-transaction */
+inline void btr_page_set_next(buf_block_t *block, ulint next, mtr_t *mtr)
+{
+  constexpr uint16_t offset= FIL_PAGE_NEXT;
+  byte *dest= my_assume_aligned<4>(&block->page.frame[offset]);
+
+  if (!mtr->write<4,mtr_t::MAYBE_NOP>(*block, dest, next))
+    return;
+
+  /* The write returned true: mirror the 4 bytes into the compressed
+  page frame, if the block has one. */
+  if (UNIV_LIKELY_NULL(block->page.zip.data))
+    memcpy_aligned<4>(&block->page.zip.data[offset], dest, 4);
+}
+
+/** Store the predecessor page number in the FIL_PAGE_PREV field.
+@param[in,out]  block  buffer block
+@param[in]      prev   number of predecessor page
+@param[in,out]  mtr    mini-transaction */
+inline void btr_page_set_prev(buf_block_t *block, ulint prev, mtr_t *mtr)
+{
+  constexpr uint16_t offset= FIL_PAGE_PREV;
+  byte *dest= my_assume_aligned<4>(&block->page.frame[offset]);
+
+  if (!mtr->write<4,mtr_t::MAYBE_NOP>(*block, dest, prev))
+    return;
+
+  /* The write returned true: mirror the 4 bytes into the compressed
+  page frame, if the block has one. */
+  if (UNIV_LIKELY_NULL(block->page.zip.data))
+    memcpy_aligned<4>(&block->page.zip.data[offset], dest, 4);
+}
+
+/**************************************************************//**
+Reads the child page number from a node pointer record.
+NOTE: the last field of the record, as described by offsets, is taken
+to hold the child page number, so offsets must cover every field of
+the record. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
+@return child node address */
+UNIV_INLINE
+uint32_t
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+	const rec_t*	rec,	/*!< in: node pointer record */
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
+
+	/* The child address is in the last field */
+	ulint		field_len;
+	const byte*	last_field = rec_get_nth_field(
+		rec, offsets, rec_offs_n_fields(offsets) - 1, &field_len);
+
+	/* A page number occupies exactly 4 bytes */
+	ut_ad(field_len == 4);
+
+	const uint32_t	child_page_no = mach_read_from_4(last_field);
+	ut_ad(child_page_no > 1);
+
+	return(child_page_no);
+}
diff --git a/storage/innobase/include/btr0bulk.h b/storage/innobase/include/btr0bulk.h
new file mode 100644
index 00000000..9fcea86d
--- /dev/null
+++ b/storage/innobase/include/btr0bulk.h
@@ -0,0 +1,371 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0bulk.h
+The B-tree bulk load
+
+Created 03/11/2014 Shaohua Wang
+*************************************************************************/
+
+#ifndef btr0bulk_h
+#define btr0bulk_h
+
+#include "dict0dict.h"
+#include "rem0types.h"
+#include "page0cur.h"
+
+#include <vector>
+
+/** Innodb B-tree index fill factor for bulk load. */
+extern	uint	innobase_fill_factor;
+
+/*
+The proper function call sequence of PageBulk is as below:
+-- PageBulk::init
+-- PageBulk::insert
+-- PageBulk::finish
+-- PageBulk::compress(COMPRESSED table only)
+-- BtrBulk::pageSplit(COMPRESSED table only, if compress() failed)
+-- PageBulk::commit
+*/
+
+class PageBulk
+{
+public:
+	/** Constructor
+	@param[in]	index		B-tree index
+	@param[in]	trx_id		transaction id
+	@param[in]	page_no		page number
+	@param[in]	level		page level */
+	PageBulk(
+		dict_index_t*	index,
+		trx_id_t	trx_id,
+		uint32_t	page_no,
+		ulint		level)
+		:
+		m_heap(NULL),
+		m_index(index),
+		m_mtr(),
+		m_trx_id(trx_id),
+		m_block(NULL),
+		m_page(NULL),
+		m_page_zip(NULL),
+		m_cur_rec(NULL),
+		m_page_no(page_no),
+		m_level(level),
+		m_is_comp(dict_table_is_comp(index->table)),
+		m_heap_top(NULL),
+		m_rec_no(0),
+		m_free_space(0),
+		m_reserved_space(0),
+#ifdef UNIV_DEBUG
+		m_total_data(0),
+#endif /* UNIV_DEBUG */
+		m_modify_clock(0),
+		m_err(DB_SUCCESS)
+	{
+		ut_ad(!dict_index_is_spatial(m_index));
+		ut_ad(!m_index->table->is_temporary());
+	}
+
+	/** Destructor: frees m_heap */
+	~PageBulk()
+	{
+		mem_heap_free(m_heap);
+	}
+
+	/** Initialize members and allocate page if needed and start mtr.
+	Note: must be called and only once right after constructor.
+	@return error code */
+	dberr_t init();
+
+	/** Insert a record in the page.
+	@param[in]	rec		record
+	@param[in]	offsets		record offsets */
+	inline void insert(const rec_t* rec, rec_offs* offsets);
+private:
+	/** Page format */
+	enum format { REDUNDANT, DYNAMIC, COMPRESSED };
+	/** Mark end of insertion to the page. Scan all records to set page
+	dirs, and set page header members.
+	@tparam format  the page format */
+	template<format> inline void finishPage();
+	/** Insert a record in the page.
+	@tparam format  the page format
+	@param[in,out]	rec		record
+	@param[in]	offsets		record offsets */
+	template<format> inline void insertPage(rec_t* rec, rec_offs* offsets);
+
+public:
+	/** Mark end of insertion to the page. Scan all records to set page
+	dirs, and set page header members. */
+	inline void finish();
+
+  /** @return whether finish() actually needs to do something */
+  inline bool needs_finish() const;
+
+	/** Commit mtr for a page
+	@param[in]	success		Flag whether all inserts succeed. */
+	void commit(bool success);
+
+	/** Compress the page if it belongs to a compressed table
+	@retval	true	compressed successfully, or no need to compress
+	@retval	false	compression failed */
+	bool compress();
+
+	/** Check whether the record needs to be stored externally.
+	@retval	true	if the record needs to be stored externally
+	@retval	false	otherwise */
+	bool needExt(const dtuple_t* tuple, ulint rec_size);
+
+	/** Store external record
+	@param[in]	big_rec		external record
+	@param[in]	offsets		record offsets
+	@return	error code */
+	dberr_t storeExt(const big_rec_t* big_rec, rec_offs* offsets);
+
+	/** Get node pointer
+	@return node pointer */
+	dtuple_t* getNodePtr();
+
+	/** Get split rec in the page. We split a page in half when compression
+	fails, and the split rec should be copied to the new page.
+	@return split rec */
+	rec_t*	getSplitRec();
+
+	/** Copy all records after split rec including itself.
+	@param[in]	split_rec	split rec */
+	void copyIn(rec_t*	split_rec);
+
+	/** Remove all records after split rec including itself.
+	@param[in]	split_rec	split rec */
+	void copyOut(rec_t*	split_rec);
+
+	/** Set next page
+	@param[in]	next_page_no	next page no */
+	inline void setNext(ulint next_page_no);
+
+	/** Set previous page
+	@param[in]	prev_page_no	previous page no */
+	inline void setPrev(ulint prev_page_no);
+
+	/** Release block by committing mtr */
+	inline void release();
+
+	/** Start mtr and latch block */
+	inline void latch();
+
+	/** Check if required space is available in the page for the rec
+	to be inserted.	We check fill factor & padding here.
+	@param[in]	rec_size	required length
+	@return true	if space is available */
+	inline bool isSpaceAvailable(ulint	rec_size);
+
+	/** Get page no */
+	uint32_t getPageNo() const { return m_page_no; }
+
+	/** Get page level */
+	ulint	getLevel() const
+	{
+		return(m_level);
+	}
+
+	/** Get record no */
+	ulint	getRecNo() const
+	{
+		return(m_rec_no);
+	}
+
+	/** Get page */
+	page_t*	getPage()
+	{
+		return(m_page);
+	}
+
+	/** Get page zip */
+	page_zip_des_t*	getPageZip()
+	{
+		return(m_page_zip);
+	}
+
+	dberr_t getError() const
+	{
+		return(m_err);
+	}
+
+	void set_modified() { m_mtr.set_modified(*m_block); }
+
+	/** Memory heap for internal allocation */
+	mem_heap_t*	m_heap;
+
+private:
+	/** The index B-tree */
+	dict_index_t*	m_index;
+
+	/** The mini-transaction */
+	mtr_t		m_mtr;
+
+	/** The transaction id */
+	trx_id_t	m_trx_id;
+
+	/** The buffer block */
+	buf_block_t*	m_block;
+
+	/** The page */
+	page_t*		m_page;
+
+	/** The page zip descriptor */
+	page_zip_des_t*	m_page_zip;
+
+	/** The current rec, just before the next insert rec */
+	rec_t*		m_cur_rec;
+
+	/** The page no */
+	uint32_t	m_page_no;
+
+	/** The page level in B-tree */
+	ulint		m_level;
+
+	/** Flag: is page in compact format */
+	const bool	m_is_comp;
+
+	/** The heap top in page for next insert */
+	byte*		m_heap_top;
+
+	/** User record no */
+	ulint		m_rec_no;
+
+	/** The free space left in the page */
+	ulint		m_free_space;
+
+	/** The reserved space for fill factor */
+	ulint		m_reserved_space;
+
+	/** The padding space for compressed page (never indeterminate) */
+	ulint		m_padding_space = 0;
+
+#ifdef UNIV_DEBUG
+	/** Total data in the page */
+	ulint		m_total_data;
+#endif /* UNIV_DEBUG */
+
+	/** The modify clock value of the buffer block
+	when the block is re-pinned */
+	ib_uint64_t     m_modify_clock;
+
+	/** Operation result DB_SUCCESS or error code */
+	dberr_t		m_err;
+};
+
+typedef std::vector<PageBulk*, ut_allocator<PageBulk*> >
+	page_bulk_vector;
+
+class BtrBulk
+{
+public:
+	/** Constructor
+	@param[in]	index		B-tree index
+	@param[in]	trx		transaction */
+	BtrBulk(
+		dict_index_t*	index,
+		const trx_t*	trx)
+		:
+		m_index(index),
+		m_trx(trx)
+	{
+		ut_ad(!dict_index_is_spatial(index));
+	}
+
+	/** Insert a tuple at level 0
+	@param[in]	tuple	tuple to insert.
+	@return error code */
+	dberr_t	insert(dtuple_t*	tuple)
+	{
+		return(insert(tuple, 0));
+	}
+
+	/** Btree bulk load finish. We commit the last page in each level
+	and copy the last page in top level to the root page of the index
+	if no error occurs.
+	@param[in]	err	whether bulk load was successful until now
+	@return error code  */
+	dberr_t finish(dberr_t	err);
+
+	/** Release all latches */
+	void release();
+
+	/** Re-latch all latches */
+	void latch();
+
+	table_name_t table_name() const { return m_index->table->name; }
+
+private:
+	/** Insert a tuple to a page in a level
+	@param[in]	tuple	tuple to insert
+	@param[in]	level	B-tree level
+	@return error code */
+	dberr_t insert(dtuple_t* tuple, ulint level);
+
+	/** Split a page
+	@param[in]	page_bulk	page to split
+	@param[in]	next_page_bulk	next page
+	@return	error code */
+	dberr_t pageSplit(PageBulk* page_bulk,
+			  PageBulk* next_page_bulk);
+
+	/** Commit(finish) a page. We set next/prev page no, compress a page of
+	compressed table and split the page if compression fails, insert a node
+	pointer to father page if needed, and commit mini-transaction.
+	@param[in]	page_bulk	page to commit
+	@param[in]	next_page_bulk	next page
+	@param[in]	insert_father	flag whether need to insert node ptr
+	@return	error code */
+	dberr_t pageCommit(PageBulk* page_bulk,
+			   PageBulk* next_page_bulk,
+			   bool insert_father);
+
+	/** Abort a page when an error occurs, by calling commit(false) on it
+	@param[in]	page_bulk	page bulk object
+	Note: we should call pageAbort for a PageBulk object, which is not in
+	m_page_bulks after pageCommit, and we will commit or abort PageBulk
+	objects in function "finish". */
+	void	pageAbort(PageBulk* page_bulk)
+	{
+		page_bulk->commit(false);
+	}
+
+	/** Log free check */
+	inline void logFreeCheck();
+
+private:
+	/** B-tree index */
+	dict_index_t*const	m_index;
+
+	/** Transaction */
+	const trx_t*const	m_trx;
+
+	/** Root page level (never indeterminate) */
+	ulint			m_root_level = 0;
+
+	/** Page cursor vector for all level */
+	page_bulk_vector	m_page_bulks;
+};
+
+#endif
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
new file mode 100644
index 00000000..f6abc9f5
--- /dev/null
+++ b/storage/innobase/include/btr0cur.h
@@ -0,0 +1,855 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0cur.h
+The index tree cursor
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0cur_h
+#define btr0cur_h
+
+#include "dict0dict.h"
+#include "page0cur.h"
+#include "btr0types.h"
+#include "rem0types.h"
+#include "gis0type.h"
+#include "my_base.h"
+#ifdef BTR_CUR_HASH_ADAPT
+# include "srw_lock.h"
+#endif
+
+/** Flag bits that modify btr_cur operations; they may be bitwise-ORed */
+enum {
+	/** skip undo logging */
+	BTR_NO_UNDO_LOG_FLAG = 1 << 0,
+	/** skip record lock checking */
+	BTR_NO_LOCKING_FLAG = 1 << 1,
+	/** sys fields will be found in the update vector or inserted
+	entry */
+	BTR_KEEP_SYS_FLAG = 1 << 2,
+
+	/** no rollback: the three flags above combined */
+	BTR_NO_ROLLBACK = BTR_NO_UNDO_LOG_FLAG
+		| BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG,
+
+	/** btr_cur_pessimistic_update() must keep cursor position
+	when moving columns to big_rec */
+	BTR_KEEP_POS_FLAG = 1 << 3,
+	/** the caller is creating the index or wants to bypass the
+	index->info.online creation log */
+	BTR_CREATE_FLAG = 1 << 4,
+	/** the caller of btr_cur_optimistic_update() or
+	btr_cur_update_in_place() will take care of
+	updating IBUF_BITMAP_FREE */
+	BTR_KEEP_IBUF_BITMAP = 1 << 5
+};
+
+#include "que0types.h"
+#include "row0types.h"
+
+#define btr_cur_get_page_cur(cursor)	(&(cursor)->page_cur)
+#define btr_cur_get_block(cursor)	((cursor)->page_cur.block)
+#define btr_cur_get_rec(cursor)	((cursor)->page_cur.rec)
+
+/*********************************************************//**
+Returns the compressed page on which the tree cursor is positioned.
+@return pointer to compressed page, or NULL if the page is not compressed */
+UNIV_INLINE
+page_zip_des_t*
+btr_cur_get_page_zip(
+/*=================*/
+	btr_cur_t*	cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the page of a tree cursor.
+@return pointer to page */
+UNIV_INLINE
+page_t*
+btr_cur_get_page(
+/*=============*/
+	btr_cur_t*	cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the index of a cursor.
+@param cursor b-tree cursor
+@return index */
+#define btr_cur_get_index(cursor) ((cursor)->index())
+/*********************************************************//**
+Positions a tree cursor at a given record. */
+UNIV_INLINE
+void
+btr_cur_position(
+/*=============*/
+	dict_index_t*	index,	/*!< in: index */
+	rec_t*		rec,	/*!< in: record in tree */
+	buf_block_t*	block,	/*!< in: buffer block of rec */
+	btr_cur_t*	cursor);/*!< in: cursor */
+
+/** Load the instant ALTER TABLE metadata from the clustered index
+when loading a table definition.
+@param[in,out]	table	table definition from the data dictionary
+@return	error code
+@retval	DB_SUCCESS	if no error occurred */
+dberr_t
+btr_cur_instant_init(dict_table_t* table)
+	ATTRIBUTE_COLD __attribute__((nonnull, warn_unused_result));
+
+/** Initialize the n_core_null_bytes on first access to a clustered
+index root page.
+@param[in]	index	clustered index that is on its first access
+@param[in]	page	clustered index root page
+@return	whether the page is corrupted */
+bool
+btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
+	ATTRIBUTE_COLD __attribute__((nonnull, warn_unused_result));
+
+MY_ATTRIBUTE((warn_unused_result))
+/********************************************************************//**
+Searches an index tree and positions a tree cursor on a given non-leaf level.
+NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
+to node pointer page number fields on the upper levels of the tree!
+cursor->up_match and cursor->low_match both will have sensible values.
+Cursor is left at the place where an insert of the
+search tuple should be performed in the B-tree. InnoDB does an insert
+immediately after the cursor. Thus, the cursor may end up on a user record,
+or on a page infimum record.
+@param level      the tree level of search
+@param tuple      data tuple; NOTE: n_fields_cmp in tuple must be set so that
+                  it cannot get compared to the node ptr page number field!
+@param rw_latch   RW_S_LATCH or RW_X_LATCH
+@param cursor     tree cursor; the cursor page is s- or x-latched, but see also
+                  above!
+@param mtr        mini-transaction
+@return DB_SUCCESS on success or error code otherwise */
+dberr_t btr_cur_search_to_nth_level(ulint level,
+                                    const dtuple_t *tuple,
+                                    rw_lock_type_t rw_latch,
+                                    btr_cur_t *cursor, mtr_t *mtr);
+
+/*************************************************************//**
+Tries to perform an insert to a page in an index tree, next to cursor.
+It is assumed that mtr holds an x-latch on the page. The operation does
+not succeed if there is too little space on the page. If there is just
+one record on the page, the insert will always succeed; this is to
+prevent trying to split a page with just one record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
+dberr_t
+btr_cur_optimistic_insert(
+/*======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags: if not
+				zero, the parameter thr should be
+				specified */
+	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
+				cursor stays valid */
+	rec_offs**	offsets,/*!< out: offsets on *rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap */
+	dtuple_t*	entry,	/*!< in/out: entry to insert */
+	rec_t**		rec,	/*!< out: pointer to inserted record if
+				succeed */
+	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
+				be stored externally by the caller */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	que_thr_t*	thr,	/*!< in/out: query thread; can be NULL if
+				!(~flags
+				& (BTR_NO_LOCKING_FLAG
+				| BTR_NO_UNDO_LOG_FLAG)) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction;
+				if this function returns DB_SUCCESS on
+				a leaf page of a secondary index in a
+				compressed tablespace, the caller must
+				mtr_commit(mtr) before latching
+				any further pages */
+	MY_ATTRIBUTE((nonnull(2,3,4,5,6,7,10), warn_unused_result));
+/*************************************************************//**
+Performs an insert on a page of an index tree. It is assumed that mtr
+holds an x-latch on the tree and on the cursor page. If the insert is
+made on the leaf level, to avoid deadlocks, mtr must also own x-latches
+to brothers of page, if those brothers exist.
+@return DB_SUCCESS or error number */
+dberr_t
+btr_cur_pessimistic_insert(
+/*=======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags: if not
+				zero, the parameter thr should be
+				specified; if no undo logging is specified,
+				then the caller must have reserved enough
+				free extents in the file space so that the
+				insertion will certainly succeed */
+	btr_cur_t*	cursor,	/*!< in: cursor after which to insert;
+				cursor stays valid */
+	rec_offs**	offsets,/*!< out: offsets on *rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap
+				that can be emptied */
+	dtuple_t*	entry,	/*!< in/out: entry to insert */
+	rec_t**		rec,	/*!< out: pointer to inserted record if
+				succeed */
+	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
+				be stored externally by the caller */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	que_thr_t*	thr,	/*!< in/out: query thread; can be NULL if
+				!(~flags
+				& (BTR_NO_LOCKING_FLAG
+				| BTR_NO_UNDO_LOG_FLAG)) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull(2,3,4,5,6,7,10), warn_unused_result));
+/*************************************************************//**
+See if there is enough space in the page modification log to log
+an update-in-place.
+
+@retval false if out of space; IBUF_BITMAP_FREE will be reset
+outside mtr if the page was recompressed
+@retval true if there is enough space
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
+a secondary index leaf page. This has to be done either within the
+same mini-transaction, or by invoking ibuf_reset_free_bits() before
+mtr_commit(mtr). */
+bool
+btr_cur_update_alloc_zip_func(
+/*==========================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	page_cur_t*	cursor,	/*!< in/out: B-tree page cursor */
+#ifdef UNIV_DEBUG
+	rec_offs*	offsets,/*!< in/out: offsets of the cursor record */
+#endif /* UNIV_DEBUG */
+	ulint		length,	/*!< in: size needed */
+	bool		create,	/*!< in: true=delete-and-insert,
+				false=update-in-place */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+#ifdef UNIV_DEBUG
+# define btr_cur_update_alloc_zip(page_zip,cursor,offsets,len,cr,mtr) \
+	btr_cur_update_alloc_zip_func(page_zip,cursor,offsets,len,cr,mtr)
+#else /* UNIV_DEBUG */
+# define btr_cur_update_alloc_zip(page_zip,cursor,offsets,len,cr,mtr) \
+	btr_cur_update_alloc_zip_func(page_zip,cursor,len,cr,mtr)
+#endif /* UNIV_DEBUG */
+
+/** Apply an update vector to a record. No field size changes are allowed.
+
+This is usually invoked on a clustered index. The only use case for a
+secondary index is row_ins_sec_index_entry_by_modify() or its
+counterpart in ibuf_insert_to_index_page().
+@param[in,out]  rec     index record
+@param[in]      index   the index of the record
+@param[in]      offsets rec_get_offsets(rec, index)
+@param[in]      update  update vector
+@param[in,out]  block   index page
+@param[in,out]  mtr     mini-transaction */
+void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index,
+                              const rec_offs *offsets, const upd_t *update,
+                              buf_block_t *block, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
+/*************************************************************//**
+Updates a record when the update causes no size changes in its fields.
+@return locking or undo log related error code, or
+@retval DB_SUCCESS on success
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
+dberr_t
+btr_cur_update_in_place(
+/*====================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
+				cursor stays valid and positioned on the
+				same record */
+	rec_offs*	offsets,/*!< in/out: offsets on cursor->page_cur.rec */
+	const upd_t*	update,	/*!< in: update vector */
+	ulint		cmpl_info,/*!< in: compiler info on secondary index
+				updates */
+	que_thr_t*	thr,	/*!< in: query thread */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
+				is a secondary index, the caller must
+				mtr_commit(mtr) before latching any
+				further pages */
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
+/*************************************************************//**
+Tries to update a record on a page in an index tree. It is assumed that mtr
+holds an x-latch on the page. The operation does not succeed if there is too
+little space on the page or if the update would result in too empty a page,
+so that tree compression is recommended.
+@return error code, including
+@retval DB_SUCCESS on success
+@retval DB_OVERFLOW if the updated record does not fit
+@retval DB_UNDERFLOW if the page would become too empty
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page */
+dberr_t
+btr_cur_optimistic_update(
+/*======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
+				cursor stays valid and positioned on the
+				same record */
+	rec_offs**	offsets,/*!< out: offsets on cursor->page_cur.rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to NULL or memory heap */
+	const upd_t*	update,	/*!< in: update vector; this must also
+				contain trx id and roll ptr fields */
+	ulint		cmpl_info,/*!< in: compiler info on secondary index
+				updates */
+	que_thr_t*	thr,	/*!< in: query thread */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
+				is a secondary index, the caller must
+				mtr_commit(mtr) before latching any
+				further pages */
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
+/*************************************************************//**
+Performs an update of a record on a page of a tree. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. If the
+update is made on the leaf level, to avoid deadlocks, mtr must also
+own x-latches to brothers of page, if those brothers exist.
+@return DB_SUCCESS or error code */
+dberr_t
+btr_cur_pessimistic_update(
+/*=======================*/
+	ulint		flags,	/*!< in: undo logging, locking, and rollback
+				flags */
+	btr_cur_t*	cursor,	/*!< in/out: cursor on the record to update;
+				cursor may become invalid if *big_rec == NULL
+				|| !(flags & BTR_KEEP_POS_FLAG) */
+	rec_offs**	offsets,/*!< out: offsets on cursor->page_cur.rec */
+	mem_heap_t**	offsets_heap,
+				/*!< in/out: pointer to memory heap
+				that can be emptied */
+	mem_heap_t*	entry_heap,
+				/*!< in/out: memory heap for allocating
+				big_rec and the index tuple */
+	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
+				be stored externally by the caller */
+	upd_t*		update,	/*!< in/out: update vector; this is allowed to
+				also contain trx id and roll ptr fields.
+				Non-updated columns that are moved offpage will
+				be appended to this. */
+	ulint		cmpl_info,/*!< in: compiler info on secondary index
+				updates */
+	que_thr_t*	thr,	/*!< in: query thread */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction; must be committed
+				before latching any further pages */
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
+/***********************************************************//**
+Marks a clustered index record deleted. Writes an undo log record to
+undo log on this delete marking. Writes in the trx id field the id
+of the deleting transaction, and in the roll ptr field pointer to the
+undo log record created.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+dberr_t
+btr_cur_del_mark_set_clust_rec(
+/*===========================*/
+	buf_block_t*	block,	/*!< in/out: buffer block of the record */
+	rec_t*		rec,	/*!< in/out: record */
+	dict_index_t*	index,	/*!< in: clustered index of the record */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec) */
+	que_thr_t*	thr,	/*!< in: query thread */
+	const dtuple_t*	entry,	/*!< in: dtuple for the deleting record */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Tries to compress a page of the tree if it seems useful. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. To avoid
+deadlocks, mtr must also own x-latches to brothers of page, if those
+brothers exist. NOTE: it is assumed that the caller has reserved enough
+free extents so that the compression will always succeed if done!
+@return whether compression occurred */
+bool
+btr_cur_compress_if_useful(
+/*=======================*/
+	btr_cur_t*	cursor,	/*!< in/out: cursor on the page to compress;
+				cursor does not stay valid if !adjust and
+				compression occurs */
+	bool		adjust,	/*!< in: whether the cursor position should be
+				adjusted even when compression occurs */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+/*******************************************************//**
+Removes the record on which the tree cursor is positioned. It is assumed
+that the mtr has an x-latch on the page where the cursor is positioned,
+but no latch on the whole tree.
+@return error code
+@retval DB_FAIL if the page would become too empty */
+dberr_t
+btr_cur_optimistic_delete(
+/*======================*/
+	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete;
+				cursor stays valid: if deletion succeeds,
+				on function exit it points to the successor
+				of the deleted record */
+	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
+	mtr_t*		mtr)	/*!< in: mtr; if the deletion succeeds
+				on a leaf page of a secondary
+				index, the mtr must be committed
+				before latching any further pages */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Removes the record on which the tree cursor is positioned. Tries
+to compress the page if its fillfactor drops below a threshold
+or if it is the only page on the level. It is assumed that mtr holds
+an x-latch on the tree and on the cursor page. To avoid deadlocks,
+mtr must also own x-latches to brothers of page, if those brothers
+exist.
+@return TRUE if compression occurred */
+ibool
+btr_cur_pessimistic_delete(
+/*=======================*/
+	dberr_t*		err,	/*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
+				the latter may occur because we may have
+				to update node pointers on upper levels,
+				and in the case of variable length keys
+				these may actually grow in size */
+	ibool		has_reserved_extents, /*!< in: TRUE if the
+				caller has already reserved enough free
+				extents so that he knows that the operation
+				will succeed */
+	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete;
+				if compression does not occur, the cursor
+				stays valid: it points to successor of
+				deleted record on function exit */
+	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
+	bool		rollback,/*!< in: performing rollback? */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((nonnull));
+/** Delete the node pointer in a parent page.
+@param[in,out]	parent	cursor pointing to parent record
+@param[in,out]	mtr	mini-transaction */
+dberr_t btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Parses a redo log record of updating a record in-place.
+@return end of log record or NULL */
+byte*
+btr_cur_parse_update_in_place(
+/*==========================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	page_t*		page,	/*!< in/out: page or NULL */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	dict_index_t*	index);	/*!< in: index corresponding to page */
+/** One range endpoint passed to btr_estimate_n_rows_in_range() */
+struct btr_pos_t
+{
+  dtuple_t*       tuple;       /*!< range start or end; may be NULL */
+  page_cur_mode_t mode;        /*!< search mode for the range */
+  page_id_t       page_id;     /*!< out: page where the tuple was found */
+  btr_pos_t(dtuple_t *arg_tuple, page_cur_mode_t arg_mode,
+            page_id_t arg_page_id) :
+    tuple(arg_tuple),
+    mode(arg_mode),
+    page_id(arg_page_id)
+  {}
+};
+
+/** Estimates the number of rows in a given index range. Search in the
+left page; then, if there are pages between the left and right ones, read a
+few pages to the right. If the right page is reached, fetch it and count the
+exact number of rows; otherwise count the estimated (see
+btr_estimate_n_rows_in_range_on_level() for details) number of rows, and
+fetch the right page. If leaves are reached, unlatch non-leaf pages except
+the right leaf parent. After the right leaf page is fetched, commit mtr.
+@param[in]  index       index
+@param[in]  range_start range start
+@param[in]  range_end   range end
+@return estimated number of rows */
+ha_rows btr_estimate_n_rows_in_range(dict_index_t *index,
+                                     btr_pos_t *range_start,
+                                     btr_pos_t *range_end);
+
+/** Gets the externally stored size of a record, in units of a database page.
+@param[in]	rec	record
+@param[in]	offsets	array returned by rec_get_offsets()
+@return externally stored part, in units of a database page */
+ulint
+btr_rec_get_externally_stored_len(
+	const rec_t*	rec,
+	const rec_offs*	offsets);
+
+/*******************************************************************//**
+Marks non-updated off-page fields as disowned by this record. The ownership
+must be transferred to the updated record which is inserted elsewhere in the
+index tree. In purge only the owner of externally stored field is allowed
+to free the field. */
+void
+btr_cur_disown_inherited_fields(
+/*============================*/
+	buf_block_t*	block,	/*!< in/out: index page */
+	rec_t*		rec,	/*!< in/out: record in a clustered index */
+	dict_index_t*	index,	/*!< in: index of the page */
+	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
+	const upd_t*	update,	/*!< in: update vector */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull(2,3,4,5,6)));
+
+/** Operation codes accepted by btr_store_big_rec_extern_fields(). */
+enum blob_op {
+	/** off-page columns of a freshly inserted record */
+	BTR_STORE_INSERT = 0,
+	/** off-page columns of an insert by update */
+	BTR_STORE_INSERT_UPDATE = 1,
+	/** off-page columns of an update */
+	BTR_STORE_UPDATE = 2,
+	/** off-page columns of a record freshly inserted by bulk */
+	BTR_STORE_INSERT_BULK = 3
+};
+
+/*******************************************************************//**
+Determine if an operation on off-page columns is an update.
+@return TRUE if op is BTR_STORE_INSERT_UPDATE or BTR_STORE_UPDATE */
+UNIV_INLINE
+ibool
+btr_blob_op_is_update(
+/*==================*/
+	enum blob_op	op)	/*!< in: operation */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*******************************************************************//**
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec.  The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from the leaf node
+file segment of the index tree.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+dberr_t
+btr_store_big_rec_extern_fields(
+/*============================*/
+	btr_pcur_t*	pcur,		/*!< in: a persistent cursor */
+	rec_offs*	offsets,	/*!< in/out: rec_get_offsets() on
+					pcur. The "external storage" flags
+					in offsets will correctly correspond
+					to rec when this function returns */
+	const big_rec_t*big_rec_vec,	/*!< in: vector containing fields
+					to be stored externally */
+	mtr_t*		btr_mtr,	/*!< in/out: mtr containing the
+					latches to the clustered index. Can be
+					committed and restarted. */
+	enum blob_op	op)		/*!< in: operation code */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*******************************************************************//**
+Frees the space in an externally stored field to the file space
+management if the field in data owns the externally stored field;
+in a rollback we may have the additional condition that the field must
+not be inherited. */
+void
+btr_free_externally_stored_field(
+/*=============================*/
+	dict_index_t*	index,		/*!< in: index of the data, the index
+					tree MUST be X-latched; if the tree
+					height is 1, then also the root page
+					must be X-latched! (this is relevant
+					in the case this function is called
+					from purge where 'data' is located on
+					an undo log page, not an index
+					page) */
+	byte*		field_ref,	/*!< in/out: field reference */
+	const rec_t*	rec,		/*!< in: record containing field_ref, for
+					page_zip_write_blob_ptr(), or NULL */
+	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec, index),
+					or NULL */
+	buf_block_t*	block,		/*!< in/out: page of field_ref */
+	ulint		i,		/*!< in: field number of field_ref;
+					ignored if rec == NULL */
+	bool		rollback,	/*!< in: performing rollback? */
+	mtr_t*		local_mtr)	/*!< in: mtr containing the latch */
+	MY_ATTRIBUTE((nonnull(1,2,5,8)));
+
+/** Copies the prefix of an externally stored field of a record.
+The clustered index record must be protected by a lock or a page latch.
+@param[out]	buf		the field, or a prefix of it
+@param[in]	len		length of buf, in bytes
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	data		'internally' stored part of the field
+containing also the reference to the external part; must be protected by
+a lock or a page latch
+@param[in]	local_len	length of data, in bytes
+@return the length of the copied field, or 0 if the column was being
+or has been deleted */
+ulint
+btr_copy_externally_stored_field_prefix(
+	byte*			buf,
+	ulint			len,
+	ulint			zip_size,
+	const byte*		data,
+	ulint			local_len);
+
+/** Copies an externally stored field of a record to mem heap.
+The clustered index record must be protected by a lock or a page latch.
+@param[out]	len		length of the whole field
+@param[in]	data		'internally' stored part of the field
+containing also the reference to the external part; must be protected by
+a lock or a page latch
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	local_len	length of data
+@param[in,out]	heap		mem heap
+@return the whole field copied to heap */
+byte*
+btr_copy_externally_stored_field(
+	ulint*			len,
+	const byte*		data,
+	ulint			zip_size,
+	ulint			local_len,
+	mem_heap_t*		heap);
+
+/** Copies an externally stored field of a record to mem heap.
+@param[in]	rec		record in a clustered index; must be
+protected by a lock or a page latch
+@param[in]	offsets		array returned by rec_get_offsets()
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	no		field number
+@param[out]	len		length of the field
+@param[in,out]	heap		mem heap
+@return the field copied to heap, or NULL if the field is incomplete */
+byte*
+btr_rec_copy_externally_stored_field(
+	const rec_t*		rec,
+	const rec_offs*		offsets,
+	ulint			zip_size,
+	ulint			no,
+	ulint*			len,
+	mem_heap_t*		heap);
+
+/*######################################################################*/
+
+/** In the pessimistic delete, if the page data size drops below this
+limit, merging it to a neighbor is tried */
+#define BTR_CUR_PAGE_COMPRESS_LIMIT(index) \
+	((srv_page_size * (ulint)((index)->merge_threshold)) / 100)
+
+/** A slot in the path array. We store here info on a search path down the
+tree. Each slot contains data on a single level of the tree. */
+struct btr_path_t {
+	/* Assume a page like:
+	records:             (inf, a, b, c, d, sup)
+	index of the record:    0, 1, 2, 3, 4, 5
+	*/
+
+	/** Index of the record where the page cursor stopped on this level
+	(index in alphabetical order). Value ULINT_UNDEFINED denotes array
+	end. In the above example, if the search stopped on record 'c', then
+	nth_rec will be 3. */
+	ulint	nth_rec;
+
+	/** Number of the records on the page, not counting inf and sup.
+	In the above example n_recs will be 4. */
+	ulint	n_recs;
+
+	/** Number of the page containing the record. */
+	uint32_t page_no;
+
+	/** Level of the page. If later we fetch the page under page_no
+	and it is on a different level then we know that the tree has been
+	reorganized. */
+	ulint	page_level;
+};
+
+#define BTR_PATH_ARRAY_N_SLOTS	250	/*!< size of path array (in slots) */
+
+/** Values for the flag documenting the used search method */
+enum btr_cur_method {
+	BTR_CUR_HASH = 1,	/*!< successful shortcut using
+				the hash index */
+	BTR_CUR_HASH_FAIL,	/*!< failure using hash, success using
+				binary search: the misleading hash
+				reference is stored in the field
+				hash_node, and might be necessary to
+				update */
+	BTR_CUR_BINARY,		/*!< success using the binary search */
+	BTR_CUR_INSERT_TO_IBUF,	/*!< performed the intended insert to
+				the insert buffer */
+	BTR_CUR_DEL_MARK_IBUF,	/*!< performed the intended delete
+				mark in the insert/delete buffer */
+	BTR_CUR_DELETE_IBUF,	/*!< performed the intended delete in
+				the insert/delete buffer */
+	BTR_CUR_DELETE_REF	/*!< row_purge_poss_sec() failed */
+};
+
+/** The tree cursor: the definition appears here only for the compiler
+to know struct size! */
+struct btr_cur_t {
+	page_cur_t	page_cur;	/*!< page cursor */
+	purge_node_t*	purge_node;	/*!< purge node, for BTR_DELETE */
+	/*------------------------------*/
+	que_thr_t*	thr;		/*!< this field is only used
+					when search_leaf()
+					is called for an index entry
+					insertion: the calling query
+					thread is passed here to be
+					used in the insert buffer */
+	/*------------------------------*/
+	/** The following fields are used in
+	search_leaf() to pass information: */
+	/* @{ */
+	enum btr_cur_method	flag;	/*!< Search method used */
+	ulint		tree_height;	/*!< Tree height if the search is done
+					for a pessimistic insert or update
+					operation */
+	ulint		up_match;	/*!< If the search mode was PAGE_CUR_LE,
+					the number of matched fields to the
+					first user record to the right of
+					the cursor record after search_leaf();
+					for the mode PAGE_CUR_GE, the matched
+					fields to the first user record AT THE
+					CURSOR or to the right of it;
+					NOTE that the up_match and low_match
+					values may exceed the correct values
+					for comparison to the adjacent user
+					record if that record is on a
+					different leaf page! (See the note in
+					row_ins_duplicate_error_in_clust.) */
+	ulint		up_bytes;	/*!< number of matched bytes to the
+					right at the time cursor positioned;
+					only used internally in searches: not
+					defined after the search */
+	ulint		low_match;	/*!< if search mode was PAGE_CUR_LE,
+					the number of matched fields to the
+					first user record AT THE CURSOR or
+					to the left of it after search_leaf();
+					NOT defined for PAGE_CUR_GE or any
+					other search modes; see also the NOTE
+					in up_match! */
+	ulint		low_bytes;	/*!< number of matched bytes to the
+					left at the time cursor positioned;
+					only used internally in searches: not
+					defined after the search */
+	ulint		n_fields;	/*!< prefix length used in a hash
+					search if hash_node != NULL */
+	ulint		n_bytes;	/*!< hash prefix bytes if hash_node !=
+					NULL */
+	ulint		fold;		/*!< fold value used in the search if
+					flag is BTR_CUR_HASH */
+	/* @} */
+	btr_path_t*	path_arr;	/*!< in estimating the number of
+					rows in range, we store in this array
+					information of the path through
+					the tree */
+	rtr_info_t*	rtr_info;	/*!< rtree search info */
+  btr_cur_t() { memset((void*) this, 0, sizeof *this); }
+
+  dict_index_t *index() const { return page_cur.index; }
+  buf_block_t *block() const { return page_cur.block; }
+
+  /** Open the cursor on the first or last record.
+  @param first         true=first record, false=last record
+  @param index         B-tree
+  @param latch_mode    which latches to acquire
+  @param mtr           mini-transaction
+  @return error code */
+  dberr_t open_leaf(bool first, dict_index_t *index, btr_latch_mode latch_mode,
+                    mtr_t *mtr);
+
+  /** Search the leaf page record corresponding to a key.
+  @param tuple      key to search for, with correct n_fields_cmp
+  @param mode       search mode; PAGE_CUR_LE for unique prefix or for inserting
+  @param latch_mode latch mode
+  @param mtr        mini-transaction
+  @return error code */
+  dberr_t search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
+                      btr_latch_mode latch_mode, mtr_t *mtr);
+
+  /** Search the leaf page record corresponding to a key, exclusively latching
+  all sibling pages on the way.
+  @param tuple      key to search for, with correct n_fields_cmp
+  @param mode       search mode; PAGE_CUR_LE for unique prefix or for inserting
+  @param mtr        mini-transaction
+  @return error code */
+  dberr_t pessimistic_search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
+                                  mtr_t *mtr);
+
+  /** Open the cursor at a random leaf page record.
+  @param offsets   temporary memory for rec_get_offsets()
+  @param heap      memory heap for rec_get_offsets()
+  @param mtr       mini-transaction
+  @return error code */
+  inline dberr_t open_random_leaf(rec_offs *&offsets, mem_heap_t *& heap,
+                                  mtr_t &mtr);
+};
+
+/** Modify the delete-mark flag of a record.
+@tparam         flag    the value of the delete-mark flag
+@param[in,out]  block   buffer block
+@param[in,out]  rec     record on a physical index page
+@param[in,out]  mtr     mini-transaction  */
+template<bool flag>
+void btr_rec_set_deleted(buf_block_t *block, rec_t *rec, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
+
+/** If pessimistic delete fails because of lack of file space, there
+is still a good chance of success a little later.  Try this many
+times. */
+#define BTR_CUR_RETRY_DELETE_N_TIMES	100
+/** If pessimistic delete fails because of lack of file space, there
+is still a good chance of success a little later.  Sleep this time
+between retries. */
+static const std::chrono::milliseconds BTR_CUR_RETRY_SLEEP_TIME(50);
+
+/** The reference in a field for which data is stored on a different page.
+The reference is at the end of the 'locally' stored part of the field.
+'Locally' means storage in the index record.
+We store locally a long enough prefix of each column so that we can determine
+the ordering parts of each index record without looking into the externally
+stored part. */
+/*-------------------------------------- @{ */
+#define BTR_EXTERN_SPACE_ID		0U	/*!< space id where stored */
+#define BTR_EXTERN_PAGE_NO		4U	/*!< page no where stored */
+#define BTR_EXTERN_OFFSET		8U	/*!< offset of BLOB header
+						on that page */
+#define BTR_EXTERN_LEN			12U	/*!< 8 bytes containing the
+						length of the externally
+						stored part of the BLOB.
+						The 2 highest bits are
+						reserved to the flags below. */
+/*-------------------------------------- @} */
+/* #define BTR_EXTERN_FIELD_REF_SIZE	20 // moved to btr0types.h */
+
+/** The most significant bit of BTR_EXTERN_LEN (i.e., the most
+significant bit of the byte at smallest address) is set to 1 if this
+field does not 'own' the externally stored field; only the owner field
+is allowed to free the field in purge! */
+#define BTR_EXTERN_OWNER_FLAG		128U
+/** If the second most significant bit of BTR_EXTERN_LEN (i.e., the
+second most significant bit of the byte at smallest address) is 1 then
+it means that the externally stored field was inherited from an
+earlier version of the row.  In rollback we are not allowed to free an
+inherited external field. */
+#define BTR_EXTERN_INHERITED_FLAG	64U
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** Number of searches down the B-tree in btr_cur_t::search_leaf(). */
+extern ib_counter_t<ulint, ib_counter_element_t>	btr_cur_n_non_sea;
+/** Old value of btr_cur_n_non_sea.  Copied by
+srv_refresh_innodb_monitor_stats().  Referenced by
+srv_printf_innodb_monitor(). */
+extern ulint	btr_cur_n_non_sea_old;
+/** Number of successful adaptive hash index lookups in
+btr_cur_t::search_leaf(). */
+extern ib_counter_t<ulint, ib_counter_element_t>	btr_cur_n_sea;
+/** Old value of btr_cur_n_sea.  Copied by
+srv_refresh_innodb_monitor_stats().  Referenced by
+srv_printf_innodb_monitor(). */
+extern ulint	btr_cur_n_sea_old;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#ifdef UNIV_DEBUG
+/* Flag to limit optimistic insert records */
+extern uint	btr_cur_limit_optimistic_insert_debug;
+#endif /* UNIV_DEBUG */
+
+#include "btr0cur.inl"
+
+#endif
diff --git a/storage/innobase/include/btr0cur.inl b/storage/innobase/include/btr0cur.inl
new file mode 100644
index 00000000..955cf342
--- /dev/null
+++ b/storage/innobase/include/btr0cur.inl
@@ -0,0 +1,170 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0cur.inl
+The index tree cursor
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#include "btr0btr.h"
+/** Debug builds only: run CODE once NREC reaches the limit (if set > 1). */
+#ifdef UNIV_DEBUG
+# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE)\
+if (btr_cur_limit_optimistic_insert_debug > 1\
+    && (NREC) >= btr_cur_limit_optimistic_insert_debug) {\
+        CODE;\
+}
+#else /* !UNIV_DEBUG: expands to nothing */
+# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE)
+#endif /* UNIV_DEBUG */
+
+/** Get the compressed page of the block that a tree cursor is positioned on.
+@param[in]	cursor	tree cursor
+@return pointer to compressed page, or NULL if the page is not compressed */
+UNIV_INLINE
+page_zip_des_t*
+btr_cur_get_page_zip(
+	btr_cur_t*	cursor)
+{
+	buf_block_t*	cur_block = btr_cur_get_block(cursor);
+	return buf_block_get_page_zip(cur_block);
+}
+
+/** Get the page of a tree cursor, derived from the record it points to.
+@param[in]	cursor	tree cursor
+@return pointer to page */
+UNIV_INLINE
+page_t*
+btr_cur_get_page(
+	btr_cur_t*	cursor)
+{
+	rec_t*	cur_rec = page_cur_get_rec(&cursor->page_cur);
+	return page_align(cur_rec);
+}
+
+/** Position a tree cursor on the given record.
+@param[in]	index	index
+@param[in]	rec	record in tree
+@param[in]	block	buffer block of rec
+@param[out]	cursor	cursor */
+UNIV_INLINE
+void
+btr_cur_position(dict_index_t* index, rec_t* rec, buf_block_t* block,
+		 btr_cur_t* cursor)
+{
+	page_cur_t*	page_cursor = btr_cur_get_page_cur(cursor);
+	page_cur_position(rec, block, page_cursor);
+	page_cursor->index = index;
+}
+
+/** Decide whether it makes sense to compress (merge) the index page
+that a btr cursor is placed on.
+@param[in]	cursor	btr cursor
+@param[in]	mtr	mini-transaction; a debug assertion checks that it
+			holds MTR_MEMO_PAGE_X_FIX on the cursor block
+@return TRUE if compression is recommended */
+UNIV_INLINE
+ibool
+btr_cur_compress_recommendation(
+	btr_cur_t*	cursor,
+	mtr_t*		mtr)
+{
+	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+					 MTR_MEMO_PAGE_X_FIX));
+
+	const page_t*	page = btr_cur_get_page(cursor);
+
+	LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page) * 2U,
+				      return(FALSE));
+
+	if (page_has_siblings(page)
+	    && page_get_data_size(page)
+	    >= BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index())) {
+		/* The level holds more than one page and the page
+		data size has not dropped below the limit. */
+		return(FALSE);
+	}
+
+	/* The page fillfactor has dropped below a predefined
+	minimum value OR the level in the B-tree contains just
+	one page: we recommend compression if this is not the
+	root page. */
+	const auto	page_no = btr_cur_get_block(cursor)->page.id().page_no();
+
+	return cursor->index()->page != page_no;
+}
+
+/** Check whether the record under the cursor can be deleted without
+making tree compression necessary (or recommended).
+@param[in]	cursor		btr cursor
+@param[in]	rec_size	rec_get_size(btr_cur_get_rec(cursor))
+@param[in]	mtr		mini-transaction; a debug assertion checks
+				that it holds MTR_MEMO_PAGE_X_FIX on the block
+@return TRUE if can be deleted without recommended compression */
+UNIV_INLINE
+ibool
+btr_cur_can_delete_without_compress(
+	btr_cur_t*	cursor,
+	ulint		rec_size,
+	mtr_t*		mtr)
+{
+	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+					 MTR_MEMO_PAGE_X_FIX));
+
+	page_t*	page = btr_cur_get_page(cursor);
+
+	if (page_has_siblings(page) && page_get_n_recs(page) >= 2
+	    && page_get_data_size(page) - rec_size
+	    >= BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index())) {
+		/* None of the conditions listed below applies. */
+		return(TRUE);
+	}
+
+	/* The page fillfactor will drop below a predefined
+	minimum value, OR the level in the B-tree contains just
+	one page, OR the page will become empty: we recommend
+	compression if this is not the root page. */
+	const auto	page_no = btr_cur_get_block(cursor)->page.id().page_no();
+
+	return cursor->index()->page == page_no;
+}
+
+/** Determine whether an operation on off-page columns counts as an update.
+@param[in]	op	operation code
+@return TRUE for BTR_STORE_INSERT_UPDATE and BTR_STORE_UPDATE,
+FALSE for BTR_STORE_INSERT and BTR_STORE_INSERT_BULK */
+UNIV_INLINE
+ibool
+btr_blob_op_is_update(
+	enum blob_op	op)
+{
+	switch (op) {
+	case BTR_STORE_INSERT_UPDATE:
+	case BTR_STORE_UPDATE:
+		return(TRUE);
+	case BTR_STORE_INSERT:
+	case BTR_STORE_INSERT_BULK:
+		return(FALSE);
+	}
+	/* Only reached if op is not one of the enumerators handled above. */
+	ut_ad(0);
+	return(FALSE);
+}
diff --git a/storage/innobase/include/btr0defragment.h b/storage/innobase/include/btr0defragment.h
new file mode 100644
index 00000000..0523829b
--- /dev/null
+++ b/storage/innobase/include/btr0defragment.h
@@ -0,0 +1,65 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved.
+Copyright (C) 2014, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef btr0defragment_h
+#define btr0defragment_h
+
+#include "btr0pcur.h"
+
+/* Max number of pages to consider at once during defragmentation. */
+#define BTR_DEFRAGMENT_MAX_N_PAGES	32
+
+/** stats in btr_defragment */
+extern Atomic_counter<ulint> btr_defragment_compression_failures;
+extern Atomic_counter<ulint> btr_defragment_failures;
+extern Atomic_counter<ulint> btr_defragment_count;
+
+/******************************************************************//**
+Initialize defragmentation. */
+void
+btr_defragment_init(void);
+/******************************************************************//**
+Shutdown defragmentation. */
+void
+btr_defragment_shutdown();
+/******************************************************************//**
+Check whether the given index is in btr_defragment_wq. */
+bool
+btr_defragment_find_index(
+	dict_index_t*	index);	/*!< Index to find. */
+/** Defragment an index.
+@param pcur      persistent cursor
+@param thd       current session, for checking thd_killed()
+@return whether the operation was interrupted */
+bool btr_defragment_add_index(btr_pcur_t *pcur, THD *thd);
+/******************************************************************//**
+When table is dropped, this function is called to mark a table as removed in
+btr_defragment_wq. The difference between this function and the remove_index
+function is this will not NULL the event. */
+void
+btr_defragment_remove_table(
+	dict_table_t*	table);	/*!< Table to be removed. */
+/*********************************************************************//**
+Check whether we should save defragmentation statistics to persistent storage.*/
+void btr_defragment_save_defrag_stats_if_needed(dict_index_t *index);
+
+/* Stop defragmentation.*/
+void btr_defragment_end();
+extern bool btr_defragment_active;
+#endif
diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h
new file mode 100644
index 00000000..c66a3bfa
--- /dev/null
+++ b/storage/innobase/include/btr0pcur.h
@@ -0,0 +1,459 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0pcur.h
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "dict0dict.h"
+#include "btr0cur.h"
+#include "buf0block_hint.h"
+#include "btr0btr.h"
+#include "gis0rtree.h"
+
+/* Relative positions for a stored cursor position */
+enum btr_pcur_pos_t {
+	BTR_PCUR_ON		= 1,
+	BTR_PCUR_BEFORE		= 2,
+	BTR_PCUR_AFTER		= 3,
+/* Note that if the tree is not empty, btr_pcur_store_position does not
+use the following, but only uses the above three alternatives, where the
+position is stored relative to a specific record: this makes implementation
+of a scroll cursor easier */
+	BTR_PCUR_BEFORE_FIRST_IN_TREE	= 4,	/* in an empty tree */
+	BTR_PCUR_AFTER_LAST_IN_TREE	= 5	/* in an empty tree */
+};
+
+/**************************************************************//**
+Resets a persistent cursor object, freeing ::old_rec_buf if it is
+allocated and resetting the other members to their initial values. */
+void
+btr_pcur_reset(
+/*===========*/
+	btr_pcur_t*	cursor);/*!< in, out: persistent cursor */
+
+/**************************************************************//**
+Copies the stored position of a pcur to another pcur. */
+void
+btr_pcur_copy_stored_position(
+/*==========================*/
+	btr_pcur_t*	pcur_receive,	/*!< in: pcur which will receive the
+					position info */
+	btr_pcur_t*	pcur_donate);	/*!< in: pcur from which the info is
+					copied */
+/**************************************************************//**
+Sets the old_rec_buf field to NULL. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+	btr_pcur_t*	pcur);	/*!< in: persistent cursor */
+
+/** Opens a persistent cursor to an index tree without initializing the
+cursor.
+@param tuple      tuple on which search done
+@param mode       PAGE_CUR_L, ...; NOTE that if the search is made using a
+                  unique prefix of a record, mode should be PAGE_CUR_LE, not
+                  PAGE_CUR_GE, as the latter may end up on the previous page of
+                  the record!
+@param latch_mode BTR_SEARCH_LEAF, ...
+@param cursor     memory buffer for persistent cursor
+@param mtr        mini-transaction
+@return DB_SUCCESS on success or error code otherwise. */
+inline
+dberr_t btr_pcur_open_with_no_init(const dtuple_t *tuple, page_cur_mode_t mode,
+                                   btr_latch_mode latch_mode,
+                                   btr_pcur_t *cursor, mtr_t *mtr);
+
+/**************************************************************//**
+Gets the up_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+	const btr_pcur_t*	cursor); /*!< in: persistent cursor */
+/**************************************************************//**
+Gets the low_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the left if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+	const btr_pcur_t*	cursor); /*!< in: persistent cursor */
+
+/**************************************************************//**
+Frees the possible memory heap of a persistent cursor and sets the latch
+mode of the persistent cursor to BTR_NO_LATCHES.
+WARNING: this function does not release the latch on the page where the
+cursor is currently positioned. The latch is acquired by the
+"move to next/previous" family of functions. Since recursive shared locks
+are not allowed, you must take care (if using the cursor in S-mode) to
+manually release the latch by either calling
+btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
+or by mtr_t::commit(). */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+	btr_pcur_t*	cursor);	/*!< in: persistent cursor */
+/**************************************************************//**
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor is before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty! */
+void
+btr_pcur_store_position(
+/*====================*/
+	btr_pcur_t*	cursor, /*!< in: persistent cursor */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************//**
+Gets the rel_pos field for a cursor whose position has been stored.
+@return BTR_PCUR_ON, ... */
+UNIV_INLINE
+ulint
+btr_pcur_get_rel_pos(
+/*=================*/
+	const btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/**************************************************************//**
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used before calling this,
+if restoration of cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+	btr_pcur_t*	pcur,	/*!< in: persistent cursor */
+	mtr_t*		mtr);	/*!< in: mtr to commit */
+
+/** Commits the mtr and sets the clustered index pcur and secondary index
+pcur latch mode to BTR_NO_LATCHES, that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used for both cursor before
+calling this, if restoration of cursor is wanted later.
+@param[in]	pcur		persistent cursor
+@param[in]	sec_pcur	secondary index persistent cursor
+@param[in]	mtr		mtr to commit */
+UNIV_INLINE
+void
+btr_pcurs_commit_specify_mtr(
+	btr_pcur_t*	pcur,
+	btr_pcur_t*	sec_pcur,
+	mtr_t*		mtr);
+
+/*********************************************************//**
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'.
+@return TRUE if the cursor was not after last in tree */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the previous record in the tree. If no records
+are left, the cursor stays 'before first in tree'.
+@return true if the cursor was not before first in tree */
+bool
+btr_pcur_move_to_prev(
+/*==================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************//**
+Moves the persistent cursor to the next user record in the tree. If no user
+records are left, the cursor ends up 'after last in tree'.
+@return TRUE if the cursor moved forward, ending on a user record */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the first record on the next page.
+Releases the latch on the current page, and buffer-unfixes it.
+Note that there must not be modifications on the current page,
+as then the x-latch can be released only in mtr_commit. */
+dberr_t
+btr_pcur_move_to_next_page(
+/*=======================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; must be on the
+				last record of the current page */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#define btr_pcur_get_btr_cur(cursor) (&(cursor)->btr_cur)
+#define btr_pcur_get_page_cur(cursor) (&(cursor)->btr_cur.page_cur)
+#define btr_pcur_get_page(cursor) btr_pcur_get_block(cursor)->page.frame
+
+/*********************************************************//**
+Checks if the persistent cursor is on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+	const btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+	const btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+	const btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+static inline bool btr_pcur_is_before_first_in_tree(btr_pcur_t* cursor);
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+static inline bool btr_pcur_is_after_last_in_tree(btr_pcur_t* cursor);
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/*********************************************************//**
+Moves the persistent cursor to the next record on the same page. */
+UNIV_INLINE
+rec_t*
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+	btr_pcur_t*	cursor);/*!< in/out: persistent cursor */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/*********************************************************//**
+Moves the persistent cursor to the previous record on the same page. */
+UNIV_INLINE
+rec_t*
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+	btr_pcur_t*	cursor);/*!< in/out: persistent cursor */
+/*********************************************************//**
+Moves the persistent cursor to the infimum record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_before_first_on_page(
+/*===============================*/
+	btr_pcur_t*	cursor); /*!< in/out: persistent cursor */
+
+/** Position state of persistent B-tree cursor. */
+enum pcur_pos_t {
+	/** The persistent cursor is not positioned. */
+	BTR_PCUR_NOT_POSITIONED = 0,
+	/** The persistent cursor was previously positioned.
+	TODO: currently, the state can be BTR_PCUR_IS_POSITIONED,
+	though it really should be BTR_PCUR_WAS_POSITIONED,
+	because we have no obligation to commit the cursor with
+	mtr; similarly latch_mode may be out of date. This can
+	lead to problems if btr_pcur is not used the right way;
+	all current code should be ok. */
+	BTR_PCUR_WAS_POSITIONED,
+	/** The persistent cursor is positioned by optimistic get to the same
+	record as it was positioned at. Not used for rel_pos == BTR_PCUR_ON.
+	It may need adjustment depending on previous/current search direction
+	and rel_pos. */
+	BTR_PCUR_IS_POSITIONED_OPTIMISTIC,
+	/** The persistent cursor is positioned by index search.
+	Or optimistic get for rel_pos == BTR_PCUR_ON. */
+	BTR_PCUR_IS_POSITIONED
+};
+
+/** The persistent B-tree cursor structure. This is used mainly for SQL
+selects, updates, and deletes. */
+
+struct btr_pcur_t
+{
+  /** Return value of restore_position() */
+  enum restore_status {
+    /** cursor position on user rec and points on the record with
+    the same field values as in the stored record */
+    SAME_ALL,
+    /** cursor position is on user rec and points on the record with
+    the same unique field values as in the stored record */
+    SAME_UNIQ,
+    /** cursor position is not on user rec or points on the record
+    with not the same uniq field values as in the stored record */
+    NOT_SAME,
+    /** the index tree is corrupted */
+    CORRUPTED
+  };
+  /** a B-tree cursor */
+  btr_cur_t btr_cur;
+  /** @see BTR_PCUR_WAS_POSITIONED
+  BTR_SEARCH_LEAF, BTR_MODIFY_LEAF, BTR_MODIFY_TREE or BTR_NO_LATCHES,
+  depending on the latching state of the page and tree where the cursor
+  is positioned; BTR_NO_LATCHES means that the cursor is not currently
+  positioned:
+  we say then that the cursor is detached; it can be restored to
+  attached if the old position was stored in old_rec */
+  btr_latch_mode latch_mode= BTR_NO_LATCHES;
+  /** if cursor position is stored, contains an initial segment of the
+  latest record cursor was positioned either on, before or after */
+  rec_t *old_rec= nullptr;
+  /** btr_cur.index()->n_core_fields when old_rec was copied */
+  uint16 old_n_core_fields= 0;
+  /** number of fields in old_rec */
+  uint16 old_n_fields= 0;
+  /** BTR_PCUR_ON, BTR_PCUR_BEFORE, or BTR_PCUR_AFTER, depending on
+  whether cursor was on, before, or after the old_rec record */
+  btr_pcur_pos_t rel_pos= btr_pcur_pos_t(0);
+  /** buffer block when the position was stored */
+  buf::Block_hint block_when_stored;
+  /** the modify clock value of the buffer block when the cursor position
+  was stored */
+  ib_uint64_t modify_clock= 0;
+  /** btr_pcur_store_position() and restore_position() state. */
+  enum pcur_pos_t pos_state= BTR_PCUR_NOT_POSITIONED;
+  page_cur_mode_t search_mode= PAGE_CUR_UNSUPP; /*!< set when opening */
+  /** the transaction, if we know it; otherwise this field is not defined;
+  can ONLY BE USED in error prints in fatal assertion failures! */
+  trx_t *trx_if_known= nullptr;
+  /** a dynamically allocated buffer for old_rec */
+  byte *old_rec_buf= nullptr;
+  /** old_rec_buf size if old_rec_buf is not NULL */
+  ulint buf_size= 0;
+
+  /** Return the index of this persistent cursor */
+  dict_index_t *index() const { return(btr_cur.index()); }
+  MY_ATTRIBUTE((nonnull, warn_unused_result))
+  /** Restores the stored position of a persistent cursor bufferfixing
+  the page and obtaining the specified latches. If the cursor position
+  was saved when the
+  (1) cursor was positioned on a user record: this function restores the
+  position to the last record LESS OR EQUAL to the stored record;
+  (2) cursor was positioned on a page infimum record: restores the
+  position to the last record LESS than the user record which was the
+  successor of the page infimum;
+  (3) cursor was positioned on the page supremum: restores to the first
+  record GREATER than the user record which was the predecessor of the
+  supremum.
+  (4) cursor was positioned before the first or after the last in an
+  empty tree: restores to before first or after the last in the tree.
+  @param latch_mode  BTR_SEARCH_LEAF, ...
+  @param mtr         mini-transaction
+  @retval SAME_ALL cursor position on user rec and points on
+  the record with the same field values as in the stored record,
+  @retval SAME_UNIQ cursor position is on user rec and points on the
+  record with the same unique field values as in the stored record,
+  @retval NOT_SAME cursor position is not on user rec or points on
+  the record with not the same uniq field values as in the stored record,
+  @retval CORRUPTED if the index is corrupted */
+  restore_status restore_position(btr_latch_mode latch_mode, mtr_t *mtr);
+
+  /** Open the cursor on the first or last record.
+  @param first         true=first record, false=last record
+  @param index         B-tree
+  @param latch_mode    which latches to acquire
+  @param mtr           mini-transaction
+  @return error code */
+  dberr_t open_leaf(bool first, dict_index_t *index, btr_latch_mode latch_mode,
+                    mtr_t *mtr)
+
+  {
+    this->latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+    search_mode= first ? PAGE_CUR_G : PAGE_CUR_L;
+    pos_state= BTR_PCUR_IS_POSITIONED;
+    old_rec= nullptr;
+
+    return btr_cur.open_leaf(first, index, this->latch_mode, mtr);
+  }
+};
+
+inline buf_block_t *btr_pcur_get_block(btr_pcur_t *cursor)
+{
+  ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+  return cursor->btr_cur.page_cur.block;
+}
+
+inline const buf_block_t *btr_pcur_get_block(const btr_pcur_t *cursor)
+{
+  ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+  return cursor->btr_cur.page_cur.block;
+}
+
+inline rec_t *btr_pcur_get_rec(const btr_pcur_t *cursor)
+{
+  ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+  ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+  return cursor->btr_cur.page_cur.rec;
+}
+
+/** Opens a persistent cursor on an index tree: stores the latch mode, search
+mode and position state, then searches. @return search_leaf() error code */
+inline
+dberr_t
+btr_pcur_open(
+	const dtuple_t*	tuple,	/*!< in: tuple on which search done */
+        page_cur_mode_t	mode,	/*!< in: PAGE_CUR_LE, ... */
+	btr_latch_mode	latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+	btr_pcur_t*	cursor, /*!< in: memory buffer for persistent cursor */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+  cursor->latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+  cursor->search_mode= mode;
+  cursor->pos_state= BTR_PCUR_IS_POSITIONED;
+  cursor->trx_if_known= nullptr;
+  return cursor->btr_cur.search_leaf(tuple, mode, latch_mode, mtr);
+}
+
+/** Open a cursor with a PAGE_CUR_GE search and, if it lands after the last
+record of a page that has a successor, advance onto the following page. */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline
+dberr_t
+btr_pcur_open_on_user_rec(
+	const dtuple_t*	tuple,		/*!< in: search tuple */
+	btr_latch_mode	latch_mode,	/*!< in: BTR_SEARCH_LEAF or
+					BTR_MODIFY_LEAF */
+	btr_pcur_t*	cursor,		/*!< in/out: persistent cursor to
+					position */
+	mtr_t*		mtr)		/*!< in: mini-transaction */
+{
+  ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF);
+  dberr_t err= btr_pcur_open(tuple, PAGE_CUR_GE, latch_mode, cursor, mtr);
+  if (err)
+    return err;
+  const bool at_page_end= btr_pcur_is_after_last_on_page(cursor);
+  if (!at_page_end || btr_pcur_is_after_last_in_tree(cursor))
+    return DB_SUCCESS;
+  if ((err= btr_pcur_move_to_next_page(cursor, mtr)))
+    return err;
+  return btr_pcur_move_to_next_on_page(cursor) ? DB_SUCCESS : DB_CORRUPTION;
+}
+
+#include "btr0pcur.inl"
diff --git a/storage/innobase/include/btr0pcur.inl b/storage/innobase/include/btr0pcur.inl
new file mode 100644
index 00000000..b827d70d
--- /dev/null
+++ b/storage/innobase/include/btr0pcur.inl
@@ -0,0 +1,372 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0pcur.inl
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+
+/*********************************************************//**
+Gets the rel_pos field for a cursor whose position has been stored.
+@return BTR_PCUR_ON, ... */
+UNIV_INLINE
+ulint
+btr_pcur_get_rel_pos(
+/*=================*/
+	const btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+	ut_ad(cursor);
+	ut_ad(cursor->old_rec);
+	ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED
+	      || cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+	return(cursor->rel_pos);
+}
+
+/**************************************************************//**
+Gets the up_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+	const btr_pcur_t*	cursor) /*!< in: persistent cursor */
+{
+	const btr_cur_t*	btr_cursor;
+
+	ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+	      || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+	btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+	ut_ad(btr_cursor->up_match != ULINT_UNDEFINED);
+
+	return(btr_cursor->up_match);
+}
+
+/**************************************************************//**
+Gets the low_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+	const btr_pcur_t*	cursor) /*!< in: persistent cursor */
+{
+	const btr_cur_t*	btr_cursor;
+
+	ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+	      || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+	btr_cursor = btr_pcur_get_btr_cur(cursor);
+	ut_ad(btr_cursor->low_match != ULINT_UNDEFINED);
+
+	return(btr_cursor->low_match);
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+	const btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+	const btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor)));
+}
+
+/** @return nonzero if the persistent cursor is on a user record, that is,
+neither before the first nor after the last record of its page */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+	const btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+  return !(btr_pcur_is_before_first_on_page(cursor) ||
+           btr_pcur_is_after_last_on_page(cursor));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+static inline bool btr_pcur_is_before_first_in_tree(btr_pcur_t* cursor)
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	return !page_has_prev(btr_pcur_get_page(cursor))
+		&& page_cur_is_before_first(btr_pcur_get_page_cur(cursor));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+static inline bool btr_pcur_is_after_last_in_tree(btr_pcur_t* cursor)
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	return !page_has_next(btr_pcur_get_page(cursor))
+		&& page_cur_is_after_last(btr_pcur_get_page_cur(cursor));
+}
+
+/** Moves the persistent cursor to the next record on the same page and
+clears old_rec. @return the result of page_cur_move_to_next() */
+UNIV_INLINE
+rec_t*
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+	btr_pcur_t*	cursor)	/*!< in/out: persistent cursor */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	cursor->old_rec = nullptr;
+	return page_cur_move_to_next(btr_pcur_get_page_cur(cursor));
+}
+
+/** Moves the persistent cursor to the previous record on the same page and
+clears old_rec. @return the result of page_cur_move_to_prev() */
+UNIV_INLINE
+rec_t*
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+	btr_pcur_t*	cursor)	/*!< in/out: persistent cursor */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+	cursor->old_rec = nullptr;
+
+	return page_cur_move_to_prev(btr_pcur_get_page_cur(cursor));
+}
+
+/*********************************************************//**
+Advances the persistent cursor until it stands on a user record, crossing
+to following pages as needed; it can end up 'after last in tree'.
+@return TRUE if the cursor ended on a user record, FALSE otherwise */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+	cursor->old_rec = nullptr;
+
+	do {
+		if (btr_pcur_is_after_last_on_page(cursor)) {
+			/* Cross to the next page unless the index ends here. */
+			if (btr_pcur_is_after_last_in_tree(cursor)
+			    || btr_pcur_move_to_next_page(cursor, mtr)
+			    != DB_SUCCESS) {
+				return(FALSE);
+			}
+		} else if (UNIV_UNLIKELY(
+				   !btr_pcur_move_to_next_on_page(cursor))) {
+			return(FALSE);
+		}
+	} while (!btr_pcur_is_on_user_rec(cursor));
+
+	return(TRUE);
+}
+
+/*********************************************************//**
+Steps the persistent cursor one record forward, moving to the next page when
+it is after the last record of a page that is not the last one in the tree.
+@return nonzero if the cursor moved; 0 at the end of the tree or on failure */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+  ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+  ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+  cursor->old_rec= nullptr;
+
+  if (!btr_pcur_is_after_last_on_page(cursor))
+    return btr_pcur_move_to_next_on_page(cursor) != nullptr;
+  if (btr_pcur_is_after_last_in_tree(cursor))
+    return false;
+  return btr_pcur_move_to_next_page(cursor, mtr) == DB_SUCCESS;
+}
+
+/**************************************************************//**
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used before calling this,
+if restoration of cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+	btr_pcur_t*	pcur,	/*!< in: persistent cursor */
+	mtr_t*		mtr)	/*!< in: mtr to commit */
+{
+	ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+	pcur->latch_mode = BTR_NO_LATCHES;
+
+	mtr_commit(mtr);
+
+	pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/** Commits the mtr and sets the clustered index pcur and secondary index
+pcur latch mode to BTR_NO_LATCHES, that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used for both cursors before
+calling this, if restoration of cursor is wanted later.
+@param[in]	pcur		persistent cursor
+@param[in]	sec_pcur	secondary index persistent cursor
+@param[in]	mtr		mtr to commit */
+UNIV_INLINE
+void
+btr_pcurs_commit_specify_mtr(
+	btr_pcur_t*	pcur,
+	btr_pcur_t*	sec_pcur,
+	mtr_t*		mtr)
+{
+	ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(sec_pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+	pcur->latch_mode = BTR_NO_LATCHES;
+	sec_pcur->latch_mode = BTR_NO_LATCHES;
+
+	mtr_commit(mtr);
+
+	pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+	sec_pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/** Resets the old_rec_buf, old_rec and btr_cur.rtr_info pointers of a
+persistent cursor to null; no other field is touched. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+	btr_pcur_t*	pcur)	/*!< in/out: persistent cursor */
+{
+	pcur->btr_cur.rtr_info = nullptr;
+
+	pcur->old_rec = nullptr;
+	pcur->old_rec_buf = nullptr;
+}
+
+/** Opens a persistent cursor to an index tree without initializing the
+cursor.
+@param tuple      tuple on which search done
+@param mode       search mode; NOTE that if the search is made using a
+                  unique prefix of a record, mode should be PAGE_CUR_LE, not
+                  PAGE_CUR_GE, as the latter may end up on the previous page of
+                  the record!
+@param latch_mode BTR_SEARCH_LEAF, ...
+@param cursor     memory buffer for persistent cursor
+@param mtr        mini-transaction
+@return DB_SUCCESS on success or error code otherwise. */
+inline
+dberr_t btr_pcur_open_with_no_init(const dtuple_t *tuple, page_cur_mode_t mode,
+                                   btr_latch_mode latch_mode,
+                                   btr_pcur_t *cursor, mtr_t *mtr)
+{
+  cursor->latch_mode= BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode);
+  cursor->search_mode= mode;
+  cursor->pos_state= BTR_PCUR_IS_POSITIONED;
+  cursor->trx_if_known= nullptr;
+  return cursor->btr_cur.search_leaf(tuple, mode, latch_mode, mtr);
+}
+
+/**************************************************************//**
+Frees old_rec_buf and cleans up btr_cur.rtr_info (if set) of a persistent
+cursor, and resets the cursor to BTR_NO_LATCHES / BTR_PCUR_NOT_POSITIONED.
+WARNING: this function does not release the latch on the page where the
+cursor is currently positioned. The latch is acquired by the
+"move to next/previous" family of functions. Since recursive shared locks
+are not allowed, you must take care (if using the cursor in S-mode) to
+manually release the latch by either calling
+btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
+or by mtr_t::commit(). */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+	btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+  /* Free the owned resources first, then reset every reference to them. */
+  ut_free(cursor->old_rec_buf);
+  if (cursor->btr_cur.rtr_info)
+    rtr_clean_rtr_info(cursor->btr_cur.rtr_info, true);
+
+  cursor->old_rec_buf= nullptr;
+  cursor->old_rec= nullptr;
+  cursor->btr_cur.rtr_info= nullptr;
+  cursor->btr_cur.page_cur.rec= nullptr;
+  cursor->btr_cur.page_cur.block= nullptr;
+  cursor->trx_if_known= nullptr;
+
+  /* The cursor is now detached and no longer positioned. */
+  cursor->latch_mode= BTR_NO_LATCHES;
+  cursor->pos_state= BTR_PCUR_NOT_POSITIONED;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the infimum record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_before_first_on_page(
+/*===============================*/
+	btr_pcur_t*	cursor) /*!< in/out: persistent cursor */
+{
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	page_cur_set_before_first(btr_pcur_get_block(cursor),
+		btr_pcur_get_page_cur(cursor));
+
+	cursor->old_rec = nullptr;
+}
diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h
new file mode 100644
index 00000000..b75cad10
--- /dev/null
+++ b/storage/innobase/include/btr0sea.h
@@ -0,0 +1,403 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0sea.h
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#ifndef btr0sea_h
+#define btr0sea_h
+
+#include "dict0dict.h"
+#ifdef BTR_CUR_HASH_ADAPT
+#include "ha0ha.h"
+#include "srw_lock.h"
+
+#ifdef UNIV_PFS_RWLOCK
+extern mysql_pfs_key_t btr_search_latch_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+#define btr_search_sys_create() btr_search_sys.create()
+#define btr_search_sys_free() btr_search_sys.free()
+
+/** Disable the adaptive hash search system and empty the index. */
+void btr_search_disable();
+
+/** Enable the adaptive hash search system.
+@param resize whether buf_pool_t::resize() is the caller */
+void btr_search_enable(bool resize= false);
+
+/*********************************************************************//**
+Updates the search info. */
+UNIV_INLINE
+void
+btr_search_info_update(
+/*===================*/
+	dict_index_t*	index,	/*!< in: index of the cursor */
+	btr_cur_t*	cursor);/*!< in: cursor which was just positioned */
+
+/** Tries to guess the right search position based on the hash search info
+of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts,
+and the function returns TRUE, then cursor->up_match and cursor->low_match
+both have sensible values.
+@param[in,out]	index		index
+@param[in,out]	info		index search info
+@param[in]	tuple		logical record
+@param[in]	mode		PAGE_CUR_L, ....
+@param[in]	latch_mode	BTR_SEARCH_LEAF, ...
+@param[out]	cursor		tree cursor
+@param[in]	mtr		mini-transaction
+@return whether the search succeeded */
+bool
+btr_search_guess_on_hash(
+	dict_index_t*	index,
+	btr_search_t*	info,
+	const dtuple_t*	tuple,
+	ulint		mode,
+	ulint		latch_mode,
+	btr_cur_t*	cursor,
+	mtr_t*		mtr);
+
+/** Move or delete hash entries for moved records, usually in a page split.
+If new_block is already hashed, then any hash index for block is dropped.
+If new_block is not hashed, and block is hashed, then a new hash index is
+built to new_block with the same parameters as block.
+@param[in,out]	new_block	destination page
+@param[in,out]	block		source page (subject to deletion later) */
+void
+btr_search_move_or_delete_hash_entries(
+	buf_block_t*	new_block,
+	buf_block_t*	block);
+
+/** Drop any adaptive hash index entries that point to an index page.
+@param[in,out]	block	block containing index page, s- or x-latched, or an
+			index page for which we know that
+			block->buf_fix_count == 0 or it is an index page which
+			has already been removed from the buf_pool.page_hash
+			i.e.: it is in state BUF_BLOCK_REMOVE_HASH
+@param[in]	garbage_collect	drop ahi only if the index is marked
+				as freed */
+void btr_search_drop_page_hash_index(buf_block_t* block,
+				     bool garbage_collect);
+
+/** Drop possible adaptive hash index entries when a page is evicted
+from the buffer pool or freed in a file, or the index is being dropped.
+@param[in]	page_id		page id */
+void btr_search_drop_page_hash_when_freed(const page_id_t page_id);
+
+/** Updates the page hash index when a single record is inserted on a page.
+@param[in]	cursor	cursor which was positioned to the place to insert
+			using btr_cur_search_, and the new record has been
+			inserted next to the cursor.
+@param[in]	ahi_latch	the adaptive hash index latch */
+void btr_search_update_hash_node_on_insert(btr_cur_t *cursor,
+                                           srw_spin_lock *ahi_latch);
+
+/** Updates the page hash index when a single record is inserted on a page.
+@param[in,out]	cursor		cursor which was positioned to the
+				place to insert using btr_cur_search_...,
+				and the new record has been inserted next
+				to the cursor
+@param[in]	ahi_latch	the adaptive hash index latch */
+void btr_search_update_hash_on_insert(btr_cur_t *cursor,
+                                      srw_spin_lock *ahi_latch);
+
+/** Updates the page hash index when a single record is deleted from a page.
+@param[in]	cursor	cursor which was positioned on the record to delete
+			using btr_cur_search_, the record is not yet deleted.*/
+void btr_search_update_hash_on_delete(btr_cur_t *cursor);
+
+/** Validates the search system.
+@param thd   connection, for checking if CHECK TABLE has been killed
+@return true if ok */
+bool btr_search_validate(THD *thd);
+
+/** Lock all search latches in exclusive mode. */
+static inline void btr_search_x_lock_all();
+
+/** Unlock all search latches from exclusive mode. */
+static inline void btr_search_x_unlock_all();
+
+/** Lock all search latches in shared mode. */
+static inline void btr_search_s_lock_all();
+
+/** Unlock all search latches from shared mode. */
+static inline void btr_search_s_unlock_all();
+
+# ifdef UNIV_DEBUG
+/** @return whether the index is marked as freed */
+bool btr_search_check_marked_free_index(const buf_block_t *block);
+# endif /* UNIV_DEBUG */
+#else /* BTR_CUR_HASH_ADAPT */
+# define btr_search_sys_create()
+# define btr_search_sys_free()
+# define btr_search_drop_page_hash_index(block, garbage_collect)
+# define btr_search_s_lock_all(index)
+# define btr_search_s_unlock_all(index)
+# define btr_search_info_update(index, cursor)
+# define btr_search_move_or_delete_hash_entries(new_block, block)
+# define btr_search_update_hash_on_insert(cursor, ahi_latch)
+# define btr_search_update_hash_on_delete(cursor)
+# ifdef UNIV_DEBUG
+#  define btr_search_check_marked_free_index(block)
+# endif /* UNIV_DEBUG */
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#ifdef BTR_CUR_ADAPT
+/** Create and initialize search info.
+@param[in,out]	heap		heap where created
+@return own: search info struct */
+static inline btr_search_t* btr_search_info_create(mem_heap_t* heap)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** @return the search info of an index */
+static inline btr_search_t* btr_search_get_info(dict_index_t* index)
+{
+	return(index->search_info);
+}
+#endif /* BTR_CUR_ADAPT */
+
+/** The search info struct in an index */
+struct btr_search_t{
+	/* @{ The following fields are not protected by any latch.
+	Unfortunately, this means that they must be aligned to
+	the machine word, i.e., they cannot be turned into bit-fields. */
+	buf_block_t* root_guess;/*!< the root page block as of the last time
+				it was fetched, or NULL */
+#ifdef BTR_CUR_HASH_ADAPT
+	ulint	hash_analysis;	/*!< when this exceeds
+				BTR_SEARCH_HASH_ANALYSIS, the hash
+				analysis starts; this is reset if no
+				success noticed */
+	ibool	last_hash_succ;	/*!< TRUE if the last search would have
+				succeeded, or did succeed, using the hash
+				index; NOTE that the value here is not exact:
+				it is not calculated for every search, and the
+				calculation itself is not always accurate! */
+	ulint	n_hash_potential;
+				/*!< number of consecutive searches
+				which would have succeeded, or did succeed,
+				using the hash index;
+				the range is 0 .. BTR_SEARCH_BUILD_LIMIT + 5 */
+	/* @} */
+	ulint	ref_count;	/*!< Number of blocks in this index tree
+				that have search index built
+				i.e. block->index points to this index.
+				Protected by search latch, except
+				during initialization in
+				btr_search_info_create(). */
+
+	/*---------------------- @{ */
+	uint16_t n_fields;	/*!< recommended prefix length for hash search:
+				number of full fields */
+	uint16_t n_bytes;	/*!< recommended prefix: number of bytes in
+				an incomplete field
+				@see BTR_PAGE_MAX_REC_SIZE */
+	bool	left_side;	/*!< true or false, depending on whether
+				the leftmost record of several records with
+				the same prefix should be indexed in the
+				hash index */
+	/*---------------------- @} */
+#ifdef UNIV_SEARCH_PERF_STAT
+	ulint	n_hash_succ;	/*!< number of successful hash searches thus
+				far */
+	ulint	n_hash_fail;	/*!< number of failed hash searches */
+	ulint	n_patt_succ;	/*!< number of successful pattern searches thus
+				far */
+	ulint	n_searches;	/*!< number of searches */
+#endif /* UNIV_SEARCH_PERF_STAT */
+#endif /* BTR_CUR_HASH_ADAPT */
+#ifdef UNIV_DEBUG
+	ulint	magic_n;	/*!< magic number @see BTR_SEARCH_MAGIC_N */
+/** value of btr_search_t::magic_n, used in assertions */
+# define BTR_SEARCH_MAGIC_N	1112765
+#endif /* UNIV_DEBUG */
+};
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** The adaptive hash index system, divided into btr_ahi_parts partitions */
+struct btr_search_sys_t
+{
+  /** Partition of the hash table */
+  struct partition
+  {
+    /** latch protecting the hash table */
+    srw_spin_lock latch;
+    /** mapping of dtuple_fold() to rec_t* in buf_block_t::frame */
+    hash_table_t table;
+    /** memory heap for table */
+    mem_heap_t *heap;
+
+#ifdef _MSC_VER
+#pragma warning(push)
+// nonstandard extension - zero sized array, if perfschema is not compiled
+#pragma warning(disable : 4200)
+#endif
+    /** NOTE(review): looks like padding to a multiple of the cache line size */
+    char pad[(CPU_LEVEL1_DCACHE_LINESIZE - sizeof latch -
+              sizeof table - sizeof heap) &
+             (CPU_LEVEL1_DCACHE_LINESIZE - 1)];
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+    /** Zero-fill the whole partition and then initialize the latch */
+    void init()
+    {
+      memset((void*) this, 0, sizeof *this);
+      latch.SRW_LOCK_INIT(btr_search_latch_key);
+    }
+    /** Create the hash table with the given size and create the heap */
+    void alloc(ulint hash_size)
+    {
+      table.create(hash_size);
+      heap= mem_heap_create_typed(std::min<ulong>(4096,
+                                                  MEM_MAX_ALLOC_IN_BUF / 2
+                                                  - MEM_BLOCK_HEADER_SIZE
+                                                  - MEM_SPACE_NEEDED(0)),
+                                  MEM_HEAP_FOR_BTR_SEARCH);
+    }
+    /** Free the heap (resetting the pointer) and the hash table array */
+    void clear()
+    {
+      mem_heap_free(heap);
+      heap= nullptr;
+      ut_free(table.array);
+    }
+    /** Destroy the latch, and clear() if the heap is still allocated */
+    void free()
+    {
+      latch.destroy();
+      if (heap)
+        clear();
+    }
+  };
+
+  /** Partitions of the adaptive hash index */
+  partition *parts;
+
+  /** Get the partition chosen by folding an index ID and a tablespace ID */
+  partition *get_part(index_id_t id, ulint space_id) const
+  {
+    return parts + ut_fold_ulint_pair(ulint(id), space_id) % btr_ahi_parts;
+  }
+
+  /** Get the adaptive hash index partition of an index */
+  partition *get_part(const dict_index_t &index) const
+  {
+    ut_ad(!index.table->space ||
+          index.table->space->id == index.table->space_id);
+    return get_part(ulint(index.id), index.table->space_id);
+  }
+
+  /** Get the search latch for the adaptive hash index partition */
+  srw_spin_lock *get_latch(const dict_index_t &index) const
+  { return &get_part(index)->latch; }
+
+  /** Create and initialize at startup; enable if btr_search_enabled is set */
+  void create()
+  {
+    parts= static_cast<partition*>(ut_malloc(btr_ahi_parts * sizeof *parts,
+                                             mem_key_ahi));
+    for (ulong i= 0; i < btr_ahi_parts; ++i)
+      parts[i].init();
+    if (btr_search_enabled)
+      btr_search_enable();
+  }
+  /** Allocate every partition with an equal share of hash_size */
+  void alloc(ulint hash_size)
+  {
+    hash_size/= btr_ahi_parts;
+    for (ulong i= 0; i < btr_ahi_parts; ++i)
+      parts[i].alloc(hash_size);
+  }
+
+  /** Clear when disabling the adaptive hash index */
+  void clear() { for (ulong i= 0; i < btr_ahi_parts; ++i) parts[i].clear(); }
+
+  /** Free at shutdown */
+  void free()
+  {
+    if (parts)
+    {
+      for (ulong i= 0; i < btr_ahi_parts; ++i)
+        parts[i].free();
+      ut_free(parts);
+      parts= nullptr;
+    }
+  }
+};
+
+/** The adaptive hash index */
+extern btr_search_sys_t btr_search_sys;
+
+/** @return number of leaf pages pointed to by the adaptive hash index */
+TRANSACTIONAL_INLINE inline ulint dict_index_t::n_ahi_pages() const
+{
+  if (!btr_search_enabled)
+    return 0; /* the adaptive hash index is disabled */
+  srw_spin_lock *latch= &btr_search_sys.get_part(*this)->latch;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+  if (xbegin()) /* NOTE(review): presumably a lock elision attempt; confirm */
+  {
+    if (latch->is_locked())
+      xabort(); /* presumably continues on the latched path below; confirm */
+    ulint ref_count= search_info->ref_count;
+    xend();
+    return ref_count;
+  }
+#endif /* !NO_ELISION && !SUX_LOCK_GENERIC */
+  latch->rd_lock(SRW_LOCK_CALL); /* read under the shared partition latch */
+  ulint ref_count= search_info->ref_count;
+  latch->rd_unlock();
+  return ref_count;
+}
+
+#ifdef UNIV_SEARCH_PERF_STAT
+/** Number of successful adaptive hash index lookups */
+extern ulint	btr_search_n_succ;
+/** Number of failed adaptive hash index lookups */
+extern ulint	btr_search_n_hash_fail;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+/** After change in n_fields or n_bytes in info, this many rounds are waited
+before starting the hash analysis again: this is to save CPU time when there
+is no hope in building a hash index. */
+#define BTR_SEARCH_HASH_ANALYSIS	17
+
+/** Limit of consecutive searches for trying a search shortcut on the search
+pattern */
+#define BTR_SEARCH_ON_PATTERN_LIMIT	3
+
+/** Limit of consecutive searches for trying a search shortcut using
+the hash index */
+#define BTR_SEARCH_ON_HASH_LIMIT	3
+
+/** We do this many searches before trying to keep the search latch
+over calls from MySQL. If we notice someone waiting for the latch, we
+again set this much timeout. This is to reduce contention. */
+#define BTR_SEA_TIMEOUT			10000
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#include "btr0sea.inl"
+
+#endif
diff --git a/storage/innobase/include/btr0sea.inl b/storage/innobase/include/btr0sea.inl
new file mode 100644
index 00000000..5a8d6480
--- /dev/null
+++ b/storage/innobase/include/btr0sea.inl
@@ -0,0 +1,117 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0sea.inl
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "dict0mem.h"
+#include "btr0cur.h"
+#include "buf0buf.h"
+
+/** Allocate a zero-filled search info object and set its initial values.
+@param[in,out]	heap		memory heap to allocate the object from
+@return own: search info struct */
+static inline btr_search_t* btr_search_info_create(mem_heap_t* heap)
+{
+	void* const	buf = mem_heap_zalloc(heap, sizeof(btr_search_t));
+	btr_search_t* const	search_info = static_cast<btr_search_t*>(buf);
+#ifdef BTR_CUR_HASH_ADAPT
+	search_info->left_side = TRUE;
+	search_info->n_fields = 1;
+#endif /* BTR_CUR_HASH_ADAPT */
+	ut_d(search_info->magic_n = BTR_SEARCH_MAGIC_N);
+	return search_info;
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** Updates the search info.
+@param[in,out]	info	search info
+@param[in,out]	cursor	cursor which was just positioned */
+void btr_search_info_update_slow(btr_search_t *info, btr_cur_t *cursor);
+
+/** Increment the hash_analysis counter of the search info of an index.
+Once the counter has reached BTR_SEARCH_HASH_ANALYSIS, pass the search
+info and the cursor on to btr_search_info_update_slow(). Nothing is
+done if btr_search_enabled is not set.
+@param[in,out]	index	index of the cursor
+@param[in,out]	cursor	cursor which was just positioned */
+static inline
+void
+btr_search_info_update(
+	dict_index_t*	index,
+	btr_cur_t*	cursor)
+{
+	ut_ad(!index->is_spatial());
+	ut_ad(!index->table->is_temporary());
+
+	if (btr_search_enabled) {
+		btr_search_t* const	search_info
+			= btr_search_get_info(index);
+
+		search_info->hash_analysis++;
+
+		/* Until the counter reaches the threshold, only
+		the counting above is performed. */
+		const bool	analyze = search_info->hash_analysis
+			>= BTR_SEARCH_HASH_ANALYSIS;
+
+		if (analyze) {
+			ut_ad(cursor->flag != BTR_CUR_HASH);
+
+			btr_search_info_update_slow(search_info, cursor);
+		}
+	}
+}
+
+/** Acquire the latch of every adaptive hash index partition exclusively. */
+static inline void btr_search_x_lock_all()
+{
+	for (auto* part = btr_search_sys.parts;
+	     part != btr_search_sys.parts + btr_ahi_parts; ++part)
+		part->latch.wr_lock(SRW_LOCK_CALL);
+}
+
+/** Release the exclusive latch of every adaptive hash index partition. */
+static inline void btr_search_x_unlock_all()
+{
+	for (auto* part = btr_search_sys.parts;
+	     part != btr_search_sys.parts + btr_ahi_parts; ++part)
+		part->latch.wr_unlock();
+}
+
+/** Acquire the latch of every adaptive hash index partition in shared mode. */
+static inline void btr_search_s_lock_all()
+{
+	for (auto* part = btr_search_sys.parts;
+	     part != btr_search_sys.parts + btr_ahi_parts; ++part)
+		part->latch.rd_lock(SRW_LOCK_CALL);
+}
+
+/** Release the shared latch of every adaptive hash index partition. */
+static inline void btr_search_s_unlock_all()
+{
+	for (auto* part = btr_search_sys.parts;
+	     part != btr_search_sys.parts + btr_ahi_parts; ++part)
+		part->latch.rd_unlock();
+}
+#endif /* BTR_CUR_HASH_ADAPT */
diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h
new file mode 100644
index 00000000..fc829e78
--- /dev/null
+++ b/storage/innobase/include/btr0types.h
@@ -0,0 +1,154 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0types.h
+The index tree general types
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#pragma once
+
+#include "page0types.h"
+#include "rem0types.h"
+
+/** Persistent cursor */
+struct btr_pcur_t;
+/** B-tree cursor */
+struct btr_cur_t;
+/** B-tree search information for the adaptive hash index */
+struct btr_search_t;
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** Is search system enabled.
+Search system is protected by array of latches. */
+extern char	btr_search_enabled;
+
+/** Number of adaptive hash index partitions. */
+extern ulong	btr_ahi_parts;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/** The size of a reference to data stored on a different page.
+The reference is stored at the end of the prefix of the field
+in the index record. */
+#define FIELD_REF_SIZE			20U
+#define BTR_EXTERN_FIELD_REF_SIZE	FIELD_REF_SIZE
+
+/** If the data don't exceed the size, the data are stored locally. */
+#define BTR_EXTERN_LOCAL_STORED_MAX_SIZE	\
+	(BTR_EXTERN_FIELD_REF_SIZE * 2)
+
+/** Latching modes for btr_cur_t::search_leaf(). */
+enum btr_latch_mode {
+	/** Search a record on a leaf page and S-latch it. */
+	BTR_SEARCH_LEAF = RW_S_LATCH,
+	/** (Prepare to) modify a record on a leaf page and X-latch it. */
+	BTR_MODIFY_LEAF	= RW_X_LATCH,
+	/** U-latch root and X-latch a leaf page */
+	BTR_MODIFY_ROOT_AND_LEAF = RW_SX_LATCH,
+	/** Obtain no latches. */
+	BTR_NO_LATCHES = RW_NO_LATCH,
+	/** Search the previous record.
+	Used in btr_pcur_move_backward_from_page(). */
+	BTR_SEARCH_PREV = 4 | BTR_SEARCH_LEAF,
+	/** Modify the previous record.
+	Used in btr_pcur_move_backward_from_page() and ibuf_insert(). */
+	BTR_MODIFY_PREV = 4 | BTR_MODIFY_LEAF,
+	/** Start modifying the entire B-tree. */
+	BTR_MODIFY_TREE = 8 | BTR_MODIFY_LEAF,
+	/** Continue modifying the entire R-tree.
+	Only used by rtr_search_to_nth_level(). */
+	BTR_CONT_MODIFY_TREE = 4 | BTR_MODIFY_TREE,
+
+	/* BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK are mutually
+	exclusive. */
+	/** The search tuple will be inserted to the secondary index
+	at the searched position.  When the leaf page is not in the
+	buffer pool, try to use the change buffer. */
+	BTR_INSERT = 64,
+
+	/** Try to delete mark a secondary index leaf page record at
+	the searched position using the change buffer when the page is
+	not in the buffer pool. */
+	BTR_DELETE_MARK	= 128,
+
+	/** Try to purge the record using the change buffer when the
+	secondary index leaf page is not in the buffer pool. */
+	BTR_DELETE = BTR_INSERT | BTR_DELETE_MARK,
+
+	/** The caller is already holding dict_index_t::lock S-latch. */
+	BTR_ALREADY_S_LATCHED = 256,
+	/** Search and S-latch a leaf page, assuming that the
+	dict_index_t::lock S-latch is being held. */
+	BTR_SEARCH_LEAF_ALREADY_S_LATCHED = BTR_SEARCH_LEAF
+	| BTR_ALREADY_S_LATCHED,
+	/** Search and X-latch a leaf page, assuming that the
+	dict_index_t::lock is being held in non-exclusive mode. */
+	BTR_MODIFY_LEAF_ALREADY_LATCHED = BTR_MODIFY_LEAF
+	| BTR_ALREADY_S_LATCHED,
+	/** Attempt to modify records in an x-latched tree. */
+	BTR_MODIFY_TREE_ALREADY_LATCHED = BTR_MODIFY_TREE
+	| BTR_ALREADY_S_LATCHED,
+	/** U-latch root and X-latch a leaf page, assuming that
+	dict_index_t::lock is being held in U mode. */
+	BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED = BTR_MODIFY_ROOT_AND_LEAF
+	| BTR_ALREADY_S_LATCHED,
+
+	/** Attempt to delete-mark a secondary index record. */
+	BTR_DELETE_MARK_LEAF = BTR_MODIFY_LEAF | BTR_DELETE_MARK,
+	/** Attempt to delete-mark a secondary index record
+	while holding the dict_index_t::lock S-latch. */
+	BTR_DELETE_MARK_LEAF_ALREADY_S_LATCHED = BTR_DELETE_MARK_LEAF
+	| BTR_ALREADY_S_LATCHED,
+	/** Attempt to purge a secondary index record. */
+	BTR_PURGE_LEAF = BTR_MODIFY_LEAF | BTR_DELETE,
+	/** Attempt to purge a secondary index record
+	while holding the dict_index_t::lock S-latch. */
+	BTR_PURGE_LEAF_ALREADY_S_LATCHED = BTR_PURGE_LEAF
+	| BTR_ALREADY_S_LATCHED,
+
+	/** In the case of BTR_MODIFY_TREE, the caller specifies
+	the intention to delete a record only. This is used to optimize
+	the block->lock range. */
+	BTR_LATCH_FOR_DELETE = 512,
+
+	/** In the case of BTR_MODIFY_TREE, the caller specifies
+	the intention to insert a record only. This is used to optimize
+	the block->lock range. */
+	BTR_LATCH_FOR_INSERT = 1024,
+
+	/** Attempt to delete a record in the tree. */
+	BTR_PURGE_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
+	/** Attempt to delete a record in an x-latched tree. */
+	BTR_PURGE_TREE_ALREADY_LATCHED = BTR_PURGE_TREE
+	| BTR_ALREADY_S_LATCHED,
+
+	/** Attempt to insert a record into the tree. */
+	BTR_INSERT_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_INSERT,
+
+	/** This flag ORed to BTR_INSERT says that we can ignore possible
+	UNIQUE definition on secondary indexes when we decide if we can use
+	the insert buffer to speed up inserts */
+	BTR_IGNORE_SEC_UNIQUE = 2048,
+	/** Rollback in spatial index */
+	BTR_RTREE_UNDO_INS = 4096,
+	/** Try to delete mark a spatial index record */
+	BTR_RTREE_DELETE_MARK = 8192
+};
diff --git a/storage/innobase/include/buf0block_hint.h b/storage/innobase/include/buf0block_hint.h
new file mode 100644
index 00000000..d4fee7c1
--- /dev/null
+++ b/storage/innobase/include/buf0block_hint.h
@@ -0,0 +1,76 @@
+/*****************************************************************************
+
+Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License, version 2.0, as published by the
+Free Software Foundation.
+
+This program is also distributed with certain software (including but not
+limited to OpenSSL) that is licensed under separate terms, as designated in a
+particular file or component or in included license documentation. The authors
+of MySQL hereby grant you an additional permission to link the program and
+your derivative works with the separately licensed software that they have
+included with MySQL.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
+for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
+
+*****************************************************************************/
+#pragma once
+#include "buf0buf.h"
+
+namespace buf {
+class Block_hint {
+public:
+  /** Remember a block that is currently buffer-fixed.
+  @param  block   buffer-fixed block; its pointer and page ID are stored */
+  void store(buf_block_t *block)
+  {
+    ut_ad(block->page.buf_fix_count());
+    m_block= block;
+    m_page_id= block->page.id();
+  }
+
+  /** Forget the currently stored pointer. */
+  void clear() { m_block= nullptr; }
+
+  /** Invoke f on m_block (which may be null).
+  @param  f   the function to be executed; it is passed the block pointer.
+              If the block pointer is to be used after f returns,
+              f must buffer-fix the block before returning.
+  @return the return value of f */
+  template <typename F>
+  bool run_with_hint(const F &f)
+  {
+    buffer_fix_block_if_still_valid();
+    /* f() may change m_block, so the block that has to be
+    unfixed afterwards is remembered in a local variable */
+    buf_block_t *hinted= m_block;
+    const bool outcome= f(hinted);
+    if (hinted != nullptr)
+      hinted->page.unfix();
+    return outcome;
+  }
+
+  /** @return the stored block pointer (may be null) */
+  buf_block_t *block() const { return m_block; }
+
+ private:
+  /** The block pointer stored by store(). */
+  buf_block_t *m_block= nullptr;
+  /** If m_block is non-null, the m_block->page.id at time it was stored. */
+  page_id_t m_page_id{0, 0};
+
+  /** Check whether m_block is not a dangling pointer and still points to
+  a block holding the page m_page_id. If so, buffer-fix it;
+  otherwise clear() it. */
+  void buffer_fix_block_if_still_valid();
+};
+}  // namespace buf
diff --git a/storage/innobase/include/buf0buddy.h b/storage/innobase/include/buf0buddy.h
new file mode 100644
index 00000000..bb999420
--- /dev/null
+++ b/storage/innobase/include/buf0buddy.h
@@ -0,0 +1,91 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buddy.h
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#ifndef buf0buddy_h
+#define buf0buddy_h
+
+#include "buf0types.h"
+
+/** Map a block size to a slot of the buddy allocator.
+@param[in]	size	block size in bytes; must be a power of 2
+@return index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */
+inline ulint buf_buddy_get_slot(ulint size)
+{
+	ut_ad(ut_is_2pow(size));
+	ut_ad(size >= UNIV_ZIP_SIZE_MIN);
+	ut_ad(size <= srv_page_size);
+
+	/* Count the doublings of BUF_BUDDY_LOW needed to reach size. */
+	ulint	slot = 0;
+	ulint	slot_size = BUF_BUDDY_LOW;
+	while (slot_size < size) {
+		slot_size <<= 1;
+		slot++;
+	}
+	ut_ad(slot <= BUF_BUDDY_SIZES);
+	return slot;
+}
+
+/** Allocate a ROW_FORMAT=COMPRESSED block.
+@param i      index of buf_pool.zip_free[] or BUF_BUDDY_SIZES
+@param lru    assigned to true if buf_pool.mutex was temporarily released
+@return allocated block, never NULL */
+byte *buf_buddy_alloc_low(ulint i, bool *lru) MY_ATTRIBUTE((malloc));
+
+/** @return an allocated ROW_FORMAT=COMPRESSED block, never NULL
+@param size   compressed page size in bytes
+@param lru    assigned to true if buf_pool.mutex was temporarily released */
+inline byte *buf_buddy_alloc(ulint size, bool *lru= nullptr)
+{
+  const ulint slot= buf_buddy_get_slot(size);
+  return buf_buddy_alloc_low(slot, lru);
+}
+
+/** Deallocate a block.
+@param[in]	buf	block to be freed, must not be pointed to
+			by the buffer pool
+@param[in]	i	index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */
+void buf_buddy_free_low(void* buf, ulint i);
+
+/** Deallocate a block of the given size.
+@param[in]	buf	block to be freed, must not be pointed to by the buffer pool
+@param[in]	size	block size in bytes */
+inline void buf_buddy_free(void* buf, ulint size)
+{
+	const ulint	slot = buf_buddy_get_slot(size);
+	buf_buddy_free_low(buf, slot);
+}
+
+/** Try to reallocate a block.
+@param[in]	buf		block to be reallocated, must be pointed
+to by the buffer pool
+@param[in]	size		block size, up to srv_page_size
+@retval false	if failed because of no free blocks. */
+bool buf_buddy_realloc(void* buf, ulint size);
+
+/** Combine all pairs of free buddies. */
+void buf_buddy_condense_free();
+#endif /* buf0buddy_h */
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
new file mode 100644
index 00000000..332b2039
--- /dev/null
+++ b/storage/innobase/include/buf0buf.h
@@ -0,0 +1,2190 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buf.h
+The database buffer pool high-level routines
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+/** Magic value to use instead of checksums when they are disabled */
+#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL
+
+#include "fil0fil.h"
+#include "mtr0types.h"
+#include "span.h"
+#include "assume_aligned.h"
+#include "buf0types.h"
+#ifndef UNIV_INNOCHECKSUM
+#include "ut0byte.h"
+#include "page0types.h"
+#include "log0log.h"
+#include "srv0srv.h"
+#include "transactional_lock_guard.h"
+#include <ostream>
+
+/** @name Modes for buf_page_get_gen */
+/* @{ */
+#define BUF_GET			10	/*!< get always */
+#define	BUF_GET_IF_IN_POOL	11	/*!< get if in pool */
+#define BUF_PEEK_IF_IN_POOL	12	/*!< get if in pool, do not make
+					the block young in the LRU list */
+#define BUF_GET_IF_IN_POOL_OR_WATCH	15
+					/*!< Get the page only if it's in the
+					buffer pool, if not then set a watch
+					on the page. */
+#define BUF_GET_POSSIBLY_FREED		16
+					/*!< Like BUF_GET, but do not mind
+					if the file page has been freed. */
+/* @} */
+
+/** If LRU list of a buf_pool is less than this size then LRU eviction
+should not happen. This is because when we do LRU flushing we also put
+the blocks on free list. If LRU list is very small then we can end up
+in thrashing. */
+#define BUF_LRU_MIN_LEN		256
+
+/** This structure defines the information that is fetched from the buffer
+pool. It is used for printing table I/O statistics. */
+struct buf_pool_info_t
+{
+	/* General buffer pool info */
+	ulint	pool_size;		/*!< Buffer Pool size in pages */
+	ulint	lru_len;		/*!< Length of buf_pool.LRU */
+	ulint	old_lru_len;		/*!< buf_pool.LRU_old_len */
+	ulint	free_list_len;		/*!< Length of buf_pool.free list */
+	ulint	flush_list_len;		/*!< Length of buf_pool.flush_list */
+	ulint	n_pend_unzip;		/*!< buf_pool.n_pend_unzip, pages
+					pending decompress */
+	ulint	n_pend_reads;		/*!< os_aio_pending_reads() */
+	ulint	n_pending_flush_lru;	/*!< Pages pending flush in LRU */
+	ulint	n_pending_flush_list;	/*!< Pages pending flush in FLUSH
+					LIST */
+	ulint	n_pages_made_young;	/*!< number of pages made young */
+	ulint	n_pages_not_made_young;	/*!< number of pages not made young */
+	ulint	n_pages_read;		/*!< buf_pool.n_pages_read */
+	ulint	n_pages_created;	/*!< buf_pool.n_pages_created */
+	ulint	n_pages_written;	/*!< buf_pool.n_pages_written */
+	ulint	n_page_gets;		/*!< buf_pool.n_page_gets */
+	ulint	n_ra_pages_read_rnd;	/*!< buf_pool.n_ra_pages_read_rnd,
+					number of pages read ahead */
+	ulint	n_ra_pages_read;	/*!< buf_pool.n_ra_pages_read, number
+					of pages read ahead */
+	ulint	n_ra_pages_evicted;	/*!< buf_pool.n_ra_pages_evicted,
+					number of readahead pages evicted
+					without access */
+	ulint	n_page_get_delta;	/*!< num of buffer pool page gets since
+					last printout */
+
+	/* Buffer pool access stats */
+	double	page_made_young_rate;	/*!< page made young rate in pages
+					per second */
+	double	page_not_made_young_rate;/*!< page not made young rate
+					in pages per second */
+	double	pages_read_rate;	/*!< num of pages read per second */
+	double	pages_created_rate;	/*!< num of pages created per second */
+	double	pages_written_rate;	/*!< num of pages written per second */
+	ulint	page_read_delta;	/*!< num of pages read since last
+					printout */
+	ulint	young_making_delta;	/*!< num of pages made young since
+					last printout */
+	ulint	not_young_making_delta;	/*!< num of pages not made young since
+					last printout */
+
+	/* Statistics about read ahead algorithm.  */
+	double	pages_readahead_rnd_rate;/*!< random readahead rate in pages per
+					second */
+	double	pages_readahead_rate;	/*!< readahead rate in pages per
+					second */
+	double	pages_evicted_rate;	/*!< rate of readahead pages evicted
+					without access, in pages per second */
+
+	/* Stats about LRU eviction */
+	ulint	unzip_lru_len;		/*!< length of buf_pool.unzip_LRU
+					list */
+	/* Counters for LRU policy */
+	ulint	io_sum;			/*!< buf_LRU_stat_sum.io */
+	ulint	io_cur;			/*!< buf_LRU_stat_cur.io, num of IO
+					for current interval */
+	ulint	unzip_sum;		/*!< buf_LRU_stat_sum.unzip */
+	ulint	unzip_cur;		/*!< buf_LRU_stat_cur.unzip, num
+					pages decompressed in current
+					interval */
+};
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Print the given page_id_t object.
+@param[in,out]	out	the output stream
+@param[in]	page_id	the page_id_t object to be printed
+@return the output stream */
+std::ostream&
+operator<<(
+	std::ostream&		out,
+	const page_id_t		page_id);
+
+#ifndef UNIV_INNOCHECKSUM
+# define buf_pool_get_curr_size() srv_buf_pool_curr_size
+
+/** Allocate a buffer block.
+@return own: the allocated block, state()==MEMORY */
+inline buf_block_t *buf_block_alloc();
+/********************************************************************//**
+Frees a buffer block which does not contain a file page. */
+UNIV_INLINE
+void
+buf_block_free(
+/*===========*/
+	buf_block_t*	block);	/*!< in, own: block to be freed */
+
+#define buf_page_get(ID, SIZE, LA, MTR)					\
+	buf_page_get_gen(ID, SIZE, LA, NULL, BUF_GET, MTR)
+
+/** Try to acquire a page latch.
+@param rw_latch      RW_S_LATCH or RW_X_LATCH
+@param block         guessed block
+@param modify_clock  expected value of block->modify_clock
+@param mtr           mini-transaction
+@return whether the latch was acquired (the page is an allocated file page) */
+bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block,
+                             uint64_t modify_clock, mtr_t *mtr);
+
+/** Try to S-latch a page.
+Suitable for using when holding the lock_sys latches (as it avoids deadlock).
+@param[in]	page_id	page identifier
+@param[in,out]	mtr	mini-transaction
+@return the block
+@retval nullptr if an S-latch cannot be granted immediately */
+buf_block_t *buf_page_try_get(const page_id_t page_id, mtr_t *mtr);
+
+/** Get read access to a compressed page (usually of type
+FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
+The page must be released with unfix().
+NOTE: the page is not protected by any latch.  Mutual exclusion has to
+be implemented at a higher level.  In other words, all possible
+accesses to a given page through this function must be protected by
+the same set of mutexes or latches.
+@param page_id   page identifier
+@param zip_size  ROW_FORMAT=COMPRESSED page size in bytes
+@return pointer to the block, s-latched */
+buf_page_t *buf_page_get_zip(const page_id_t page_id, ulint zip_size);
+
+/** Get access to a database page. Buffered redo log may be applied.
+@param[in]	page_id			page id
+@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	rw_latch		RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in]	guess			guessed block or NULL
+@param[in]	mode			BUF_GET, BUF_GET_IF_IN_POOL,
+BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in,out]	mtr			mini-transaction
+@param[out]	err			DB_SUCCESS or error code
+@param[in]	allow_ibuf_merge	Allow change buffer merge while
+reading the pages from file.
+@return pointer to the block or NULL */
+buf_block_t*
+buf_page_get_gen(
+	const page_id_t		page_id,
+	ulint			zip_size,
+	ulint			rw_latch,
+	buf_block_t*		guess,
+	ulint			mode,
+	mtr_t*			mtr,
+	dberr_t*		err = NULL,
+	bool			allow_ibuf_merge = false)
+	MY_ATTRIBUTE((nonnull(6), warn_unused_result));
+
+/** This is the low level function used to get access to a database page.
+@param[in]	page_id			page id
+@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	rw_latch		RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in]	guess			guessed block or NULL
+@param[in]	mode			BUF_GET, BUF_GET_IF_IN_POOL,
+BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in,out]	mtr			mini-transaction, or NULL if a
+					block with page_id is to be evicted
+@param[out]	err			DB_SUCCESS or error code
+@param[in]	allow_ibuf_merge	Allow change buffer merge to happen
+while reading the page from file. If set, the function makes sure that
+buffered changes are merged into the page while it is being
+read from file.
+@return pointer to the block or NULL */
+buf_block_t*
+buf_page_get_low(
+	const page_id_t		page_id,
+	ulint			zip_size,
+	ulint			rw_latch,
+	buf_block_t*		guess,
+	ulint			mode,
+	mtr_t*			mtr,
+	dberr_t*		err,
+	bool			allow_ibuf_merge);
+
+/** Initialize a page in the buffer pool. The page is usually not read
+from a file even if it cannot be found in the buffer pool. This is one
+of the functions that perform the block state transition NOT_USED => LRU
+(the other is buf_page_get_low()).
+@param[in,out]	space		space object
+@param[in]	offset		offset of the tablespace
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out]	mtr		mini-transaction
+@param[in,out]	free_block	pre-allocated buffer block
+@return pointer to the block, page bufferfixed */
+buf_block_t*
+buf_page_create(fil_space_t *space, uint32_t offset,
+                ulint zip_size, mtr_t *mtr, buf_block_t *free_block);
+
+/** Initialize a page in buffer pool while initializing the
+deferred tablespace
+@param space_id         space identifier
+@param zip_size         ROW_FORMAT=COMPRESSED page size or 0
+@param mtr              mini-transaction
+@param free_block       pre-allocated buffer block
+@return pointer to the block, page bufferfixed */
+buf_block_t*
+buf_page_create_deferred(uint32_t space_id, ulint zip_size, mtr_t *mtr,
+                         buf_block_t *free_block);
+
+/** Move a block to the start of the LRU list. */
+void buf_page_make_young(buf_page_t *bpage);
+/** Mark the page status as FREED for the given tablespace and page number.
+@param[in,out]	space	tablespace
+@param[in]	page	page number
+@param[in,out]	mtr	mini-transaction */
+void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr);
+
+/** Determine if a block is still close enough to the MRU end of the LRU list
+meaning that it is not in danger of getting evicted and also implying
+that it has been accessed recently.
+Note that this is for heuristics only and does not reserve buffer pool
+mutex.
+@param[in]	bpage		buffer pool page
+@return whether bpage is close to MRU end of LRU */
+inline bool buf_page_peek_if_young(const buf_page_t *bpage);
+
+/** Determine if a block should be moved to the start of the LRU list if
+there is danger of dropping from the buffer pool.
+@param[in]	bpage		buffer pool page
+@return true if bpage should be made younger */
+inline bool buf_page_peek_if_too_old(const buf_page_t *bpage);
+
+/** Make a buffer pool page young, that is, move it to the start of the
+LRU list, if buf_page_peek_if_too_old() holds for it.
+@param[in,out]	bpage		buffer pool page */
+inline void buf_page_make_young_if_needed(buf_page_t *bpage)
+{
+	const bool too_old = buf_page_peek_if_too_old(bpage);
+	if (UNIV_UNLIKELY(too_old)) buf_page_make_young(bpage);
+}
+
+/********************************************************************//**
+Increments the modify clock of a frame by 1. The caller must (1) own the
+buf_pool.mutex and block bufferfix count has to be zero, (2) or own an x-lock
+on the block. */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+	buf_block_t*	block);	/*!< in: block */
+/********************************************************************//**
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block.
+@return value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+	buf_block_t*	block);	/*!< in: block */
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Check if a buffer is all zeroes.
+@param[in]	buf	data to check
+@return whether the buffer is all zeroes */
+bool buf_is_zeroes(st_::span<const byte> buf);
+
+/** Check if a page is corrupt.
+@param check_lsn   whether FIL_PAGE_LSN should be checked
+@param read_buf    database page
+@param fsp_flags   contents of FSP_SPACE_FLAGS
+@return whether the page is corrupted */
+bool buf_page_is_corrupted(bool check_lsn, const byte *read_buf,
+                           uint32_t fsp_flags)
+  MY_ATTRIBUTE((warn_unused_result));
+
+/** Fetch the key version of a page. In the full_crc32 format, it is
+the 4-byte field at the very start of the page. In any other format, it
+is the 4-byte field at FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION.
+@param[in]	read_buf	database page
+@param[in]	fsp_flags	tablespace flags
+@return key version of the page. */
+inline uint32_t buf_page_get_key_version(const byte* read_buf,
+                                         uint32_t fsp_flags)
+{
+  static_assert(FIL_PAGE_FCRC32_KEY_VERSION == 0, "compatibility");
+  if (fil_space_t::full_crc32(fsp_flags))
+    return mach_read_from_4(my_assume_aligned<4>(read_buf));
+  const byte *field= read_buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+  return mach_read_from_4(my_assume_aligned<2>(field));
+}
+
+/** Determine from the page type whether a page is page_compressed.
+In the full_crc32 format, the FIL_PAGE_COMPRESS_FCRC32_MARKER bit of the
+page type is the flag; otherwise the type equals FIL_PAGE_PAGE_COMPRESSED.
+@param[in]	read_buf	database page
+@param[in]	fsp_flags	tablespace flags
+@return true if page is compressed. */
+inline bool buf_page_is_compressed(const byte* read_buf, uint32_t fsp_flags)
+{
+  const uint16_t type= fil_page_get_type(read_buf);
+  if (!fil_space_t::full_crc32(fsp_flags))
+    return type == FIL_PAGE_PAGE_COMPRESSED;
+  return (type & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER) != 0;
+}
+
+/** Get the payload size of a full_crc32 page, based on its page type.
+If the FIL_PAGE_COMPRESS_FCRC32_MARKER bit is set, the other type bits
+shifted left by 8 are the size. The flags are only ever set, not cleared.
+@param[in]	buf	page_compressed or uncompressed page
+@param[out]	comp	set if the page could be compressed; may be NULL
+@param[out]	cr	set if the page could be corrupted; may be NULL
+@return the payload size in the file page */
+inline uint buf_page_full_crc32_size(const byte* buf, bool* comp, bool* cr)
+{
+	const uint marker = 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER;
+	const uint type = fil_page_get_type(buf);
+	const uint full_size = uint(srv_page_size);
+
+	if (!(type & marker)) {
+		return full_size;
+	}
+
+	const uint size = (type & ~marker) << 8;
+
+	if (size >= full_size) {
+		if (cr) *cr = true;
+		return full_size;
+	}
+
+	if (comp) *comp = true;
+
+	return size;
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/** Dump a page to stderr.
+@param[in]	read_buf	database page
+@param[in]	zip_size	compressed page size, or 0 */
+void buf_page_print(const byte* read_buf, ulint zip_size = 0)
+	ATTRIBUTE_COLD __attribute__((nonnull));
+/********************************************************************//**
+Decompress a block.
+@return TRUE if successful */
+ibool
+buf_zip_decompress(
+/*===============*/
+	buf_block_t*	block,	/*!< in/out: block */
+	ibool		check);	/*!< in: TRUE=verify the page checksum */
+
+#ifdef UNIV_DEBUG
+/** @return the number of latched pages in the buffer pool */
+ulint buf_get_latched_pages_number();
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+void
+buf_print_io(
+/*=========*/
+	FILE*	file);	/*!< in: file where to print */
+/** Collect buffer pool metadata.
+@param[out]	pool_info	buffer pool metadata */
+void buf_stats_get_pool_info(buf_pool_info_t *pool_info);
+
+/** Refresh the statistics used to print per-second averages. */
+void buf_refresh_io_stats();
+
+/** Invalidate all pages in the buffer pool.
+All pages must be in a replaceable state (not modified or latched). */
+void buf_pool_invalidate();
+
+/*========================================================================
+--------------------------- LOWER LEVEL ROUTINES -------------------------
+=========================================================================*/
+
+#define buf_block_get_frame(block) (block)->page.frame
+
+/*********************************************************************//**
+Gets the compressed page descriptor corresponding to an uncompressed page
+if applicable. */
+#define buf_block_get_page_zip(block) \
+	(UNIV_LIKELY_NULL((block)->page.zip.data) ? &(block)->page.zip : NULL)
+#define is_buf_block_get_page_zip(block) \
+        UNIV_LIKELY_NULL((block)->page.zip.data)
+
+/** Monitor the buffer page read/write activity, and increment corresponding
+counter value in MONITOR_MODULE_BUF_PAGE.
+@param bpage   buffer page whose read or write was completed
+@param read    true=read, false=write */
+ATTRIBUTE_COLD void buf_page_monitor(const buf_page_t &bpage, bool read);
+
+/** Calculate aligned buffer pool size based on srv_buf_pool_chunk_unit,
+if needed.
+@param[in]	size	size in bytes
+@return	aligned size */
+ulint
+buf_pool_size_align(
+	ulint	size);
+
+/** Verify that the post-encryption checksum matches the calculated checksum.
+This function should be called only if tablespace contains crypt data metadata.
+@param page       page frame
+@param fsp_flags  contents of FSP_SPACE_FLAGS
+@return whether the page is encrypted and valid */
+bool buf_page_verify_crypt_checksum(const byte *page, uint32_t fsp_flags);
+
+/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page.
+@param[in,out]	page		page to update
+@param[in]	size		compressed page size */
+void buf_flush_update_zip_checksum(buf_frame_t* page, ulint size);
+
+/** @brief The temporary memory structure.
+
+NOTE! The definition appears here only for other modules of this
+directory (buf) to see it. Do not use from outside! */
+
+class buf_tmp_buffer_t
+{
+  /** true while the slot is held by a user; see acquire() and release() */
+  std::atomic<bool> reserved;
+public:
+  /** Separate buffer that page data is copied to for encryption, because
+  the buffer block itself can be replaced while a write of crypt_buf
+  to the file is in progress. */
+  byte *crypt_buf;
+  /** buffer for fil_page_compress(), for flushing page_compressed pages */
+  byte *comp_buf;
+  /** pointer to resulting buffer after encryption or compression;
+  not separately allocated memory */
+  byte *out_buf;
+
+  /** Mark the slot as no longer reserved */
+  void release() { reserved.store(false, std::memory_order_relaxed); }
+
+  /** @return whether the slot was acquired (it was not reserved before) */
+  bool acquire()
+  { return reserved.exchange(true, std::memory_order_relaxed) == false; }
+
+  /** Allocate crypt_buf for encryption, decryption or decompression. */
+  void allocate()
+  {
+    if (crypt_buf) return; /* already allocated */
+    void *buf= aligned_malloc(srv_page_size, srv_page_size);
+    crypt_buf= static_cast<byte*>(buf);
+  }
+};
+
+/** The common buffer control block structure
+for compressed and uncompressed frames */
+
+class buf_pool_t;
+
+class buf_page_t
+{
+  friend buf_pool_t;
+  friend buf_block_t;
+
+  /** @name General fields */
+  /* @{ */
+
+public: // FIXME: fix fil_iterate()
+  /** Page id. Protected by buf_pool.page_hash.lock_get() when
+  the page is in buf_pool.page_hash. */
+  page_id_t id_;
+  /** buf_pool.page_hash link; protected by buf_pool.page_hash.lock_get() */
+  buf_page_t *hash;
+private:
+  /** log sequence number of the START of the log entry written of the
+  oldest modification to this block which has not yet been written
+  to the data file;
+
+  0 if no modifications are pending;
+  1 if no modifications are pending, but the block is in buf_pool.flush_list;
+  2 if modifications are pending, but the block is not in buf_pool.flush_list
+  (because id().space() is the temporary tablespace). */
+  Atomic_relaxed<lsn_t> oldest_modification_;
+
+public:
+  /** state() of unused block (in buf_pool.free list) */
+  static constexpr uint32_t NOT_USED= 0;
+  /** state() of block allocated as general-purpose memory */
+  static constexpr uint32_t MEMORY= 1;
+  /** state() of block that is being freed */
+  static constexpr uint32_t REMOVE_HASH= 2;
+  /** smallest state() of a buffer page that is freed in the tablespace */
+  static constexpr uint32_t FREED= 3;
+  /** smallest state() for a block that belongs to buf_pool.LRU */
+  static constexpr uint32_t UNFIXED= 1U << 29;
+  /** smallest state() of a block for which buffered changes may exist */
+  static constexpr uint32_t IBUF_EXIST= 2U << 29;
+  /** smallest state() of a (re)initialized page (no doublewrite needed) */
+  static constexpr uint32_t REINIT= 3U << 29;
+  /** smallest state() for an io-fixed block */
+  static constexpr uint32_t READ_FIX= 4U << 29;
+  /** smallest state() for a write-fixed block */
+  static constexpr uint32_t WRITE_FIX= 5U << 29;
+  /** smallest state() for a write-fixed block with buffered changes */
+  static constexpr uint32_t WRITE_FIX_IBUF= 6U << 29;
+  /** smallest state() for a write-fixed block (no doublewrite was used) */
+  static constexpr uint32_t WRITE_FIX_REINIT= 7U << 29;
+  /** buf_pool.LRU status mask in state() */
+  static constexpr uint32_t LRU_MASK= 7U << 29;
+
+  /** lock covering the contents of frame */
+  block_lock lock;
+  /** pointer to aligned, uncompressed page frame of innodb_page_size */
+  byte *frame;
+  /* @} */
+  /** ROW_FORMAT=COMPRESSED page; zip.data (but not the data it points to)
+  is also protected by buf_pool.mutex;
+  !frame && !zip.data means an active buf_pool.watch */
+  page_zip_des_t zip;
+#ifdef UNIV_DEBUG
+  /** whether this->list is in buf_pool.zip_hash; protected by buf_pool.mutex */
+  bool in_zip_hash;
+  /** whether this->LRU is in buf_pool.LRU (in_file());
+  protected by buf_pool.mutex */
+  bool in_LRU_list;
+  /** whether this is in buf_pool.page_hash (in_file());
+  protected by buf_pool.mutex */
+  bool in_page_hash;
+  /** whether this->list is in buf_pool.free (state() == NOT_USED);
+  protected by buf_pool.flush_list_mutex */
+  bool in_free_list;
+#endif /* UNIV_DEBUG */
+  /** list member in one of the lists of buf_pool; protected by
+  buf_pool.mutex or buf_pool.flush_list_mutex
+
+  state() == NOT_USED: buf_pool.free or buf_pool.withdraw
+
+  in_file() && oldest_modification():
+  buf_pool.flush_list (protected by buf_pool.flush_list_mutex)
+
+  The contents are undefined if in_file() && !oldest_modification(),
+  or if state() == MEMORY or state() == REMOVE_HASH. */
+  UT_LIST_NODE_T(buf_page_t) list;
+
+	/** @name LRU replacement algorithm fields.
+	Protected by buf_pool.mutex. */
+	/* @{ */
+
+	UT_LIST_NODE_T(buf_page_t) LRU;
+					/*!< node of the LRU list */
+	unsigned	old:1;		/*!< TRUE if the block is in the old
+					blocks in buf_pool.LRU_old */
+	unsigned	freed_page_clock:31;/*!< the value of
+					buf_pool.freed_page_clock
+					when this block was last
+					put to the head of the
+					LRU list; a thread is allowed
+					to read this for heuristic
+					purposes without holding any
+					mutex or latch */
+	/* @} */
+	Atomic_counter<unsigned> access_time;	/*!< time of first access, or
+					0 if the block was never accessed
+					in the buffer pool.
+
+					For state() == MEMORY
+					blocks, this field can be repurposed
+					for something else.
+
+					When this field counts log records
+					and bytes allocated for recv_sys.pages,
+					the field is protected by
+					recv_sys_t::mutex. */
+  buf_page_t() : id_{0}
+  {
+    static_assert(NOT_USED == 0, "compatibility");
+    memset((void*) this, 0, sizeof *this);
+  }
+
+  buf_page_t(const buf_page_t &b) :
+    id_(b.id_), hash(b.hash),
+    oldest_modification_(b.oldest_modification_),
+    lock() /* not copied */,
+    frame(b.frame), zip(b.zip),
+#ifdef UNIV_DEBUG
+    in_zip_hash(b.in_zip_hash), in_LRU_list(b.in_LRU_list),
+    in_page_hash(b.in_page_hash), in_free_list(b.in_free_list),
+#endif /* UNIV_DEBUG */
+    list(b.list), LRU(b.LRU), old(b.old), freed_page_clock(b.freed_page_clock),
+    access_time(b.access_time)
+  {
+    lock.init();
+  }
+
+  /** Assign the id and state(), initialize the lock, reset bookkeeping */
+  void init(uint32_t state, page_id_t id)
+  {
+    ut_ad(state < REMOVE_HASH || state >= UNFIXED);
+    id_= id;
+    zip.fix= state;
+    oldest_modification_= 0;
+    lock.init();
+    ut_d(in_zip_hash= false);
+    ut_d(in_free_list= false);
+    ut_d(in_LRU_list= false);
+    ut_d(in_page_hash= false);
+    old= 0;
+    freed_page_clock= 0;
+    access_time= 0;
+  }
+
+  void set_os_unused()
+  {
+    MEM_NOACCESS(frame, srv_page_size);
+#ifdef MADV_FREE
+    madvise(frame, srv_page_size, MADV_FREE);
+#endif
+  }
+
+  void set_os_used() const
+  {
+    MEM_MAKE_ADDRESSABLE(frame, srv_page_size);
+  }
+public:
+  const page_id_t &id() const { return id_; }
+  uint32_t state() const { return zip.fix; }
+  /** @return the number of buffer-fixes, decoded from state() */
+  uint32_t buf_fix_count() const
+  {
+    const uint32_t s= state(); ut_ad(s >= FREED);
+    return s >= UNFIXED ? (s & ~LRU_MASK) : (s - FREED);
+  }
+  /** @return whether this block is read or write fixed;
+  read_complete() or write_complete() will always release
+  the io-fix before releasing U-lock or X-lock */
+  bool is_io_fixed() const
+  { const auto s= state(); ut_ad(s >= FREED); return s >= READ_FIX; }
+  /** @return whether this block is write fixed;
+  write_complete() will always release the write-fix before releasing U-lock */
+  bool is_write_fixed() const { return state() >= WRITE_FIX; }
+  /** @return whether this block is read fixed; this should never hold
+  when a thread is holding the block lock in any mode */
+  bool is_read_fixed() const { return is_io_fixed() && !is_write_fixed(); }
+
+  /** @return if this belongs to buf_pool.unzip_LRU */
+  bool belongs_to_unzip_LRU() const
+  { return UNIV_LIKELY_NULL(zip.data) && frame; }
+
+  bool is_freed() const
+  { const auto s= state(); ut_ad(s >= FREED); return s < UNFIXED; }
+  bool is_ibuf_exist() const
+  {
+    const auto s= state();
+    ut_ad(s >= UNFIXED);
+    ut_ad(s < READ_FIX);
+    return (s & LRU_MASK) == IBUF_EXIST;
+  }
+  bool is_reinit() const { return !(~state() & REINIT); }
+
+  void set_reinit(uint32_t prev_state)
+  {
+    ut_ad(prev_state < READ_FIX);
+    ut_d(const auto s=) zip.fix.fetch_add(REINIT - prev_state);
+    ut_ad(s > prev_state);
+    ut_ad(s < prev_state + UNFIXED);
+  }
+
+  void set_ibuf_exist()
+  {
+    ut_ad(lock.is_write_locked());
+    ut_ad(id() < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0));
+    const auto s= state();
+    ut_ad(s >= UNFIXED);
+    ut_ad(s < READ_FIX);
+    ut_ad(s < IBUF_EXIST || s >= REINIT);
+    zip.fix.fetch_add(IBUF_EXIST - (LRU_MASK & s));
+  }
+  void clear_ibuf_exist()
+  {
+    ut_ad(lock.is_write_locked());
+    ut_ad(id() < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0));
+    ut_d(const auto s=) zip.fix.fetch_sub(IBUF_EXIST - UNFIXED);
+    ut_ad(s >= IBUF_EXIST);
+    ut_ad(s < REINIT);
+  }
+
+  uint32_t read_unfix(uint32_t s)
+  {
+    ut_ad(lock.is_write_locked());
+    ut_ad(s == UNFIXED + 1 || s == IBUF_EXIST + 1 || s == REINIT + 1);
+    uint32_t old_state= zip.fix.fetch_add(s - READ_FIX);
+    ut_ad(old_state >= READ_FIX);
+    ut_ad(old_state < WRITE_FIX);
+    return old_state + (s - READ_FIX);
+  }
+
+  void set_freed(uint32_t prev_state, uint32_t count= 0)
+  {
+    ut_ad(lock.is_write_locked());
+    ut_ad(prev_state >= UNFIXED);
+    ut_ad(prev_state < READ_FIX);
+    ut_d(auto s=) zip.fix.fetch_sub((prev_state & LRU_MASK) - FREED - count);
+    ut_ad(!((prev_state ^ s) & LRU_MASK));
+  }
+
+  inline void set_state(uint32_t s);
+  inline void set_corrupt_id();
+
+  /** @return the log sequence number of the oldest pending modification
+  @retval 0 if the block is being removed from (or not in) buf_pool.flush_list
+  @retval 1 if the block is in buf_pool.flush_list but not modified
+  @retval 2 if the block belongs to the temporary tablespace and
+  has unwritten changes */
+  lsn_t oldest_modification() const { return oldest_modification_; }
+  /** @return the log sequence number of the oldest pending modification,
+  @retval 0 if the block is definitely not in buf_pool.flush_list
+  @retval 1 if the block is in buf_pool.flush_list but not modified
+  @retval 2 if the block belongs to the temporary tablespace and
+  has unwritten changes */
+  lsn_t oldest_modification_acquire() const
+  { return oldest_modification_.load(std::memory_order_acquire); }
+  /** Set oldest_modification when adding to buf_pool.flush_list */
+  inline void set_oldest_modification(lsn_t lsn);
+  /** Clear oldest_modification after removing from buf_pool.flush_list */
+  inline void clear_oldest_modification();
+  /** Reset the oldest_modification when marking a persistent page freed */
+  void reset_oldest_modification()
+  {
+    ut_ad(oldest_modification() > 2);
+    oldest_modification_.store(1, std::memory_order_release);
+  }
+
+  /** Complete a read of a page.
+  @param node     data file
+  @return whether the operation succeeded
+  @retval DB_PAGE_CORRUPTED    if the checksum fails
+  @retval DB_DECRYPTION_FAILED if the page cannot be decrypted
+  @retval DB_FAIL              if the page contains the wrong ID */
+  dberr_t read_complete(const fil_node_t &node);
+
+  /** Note that a block is no longer dirty, while not removing
+  it from buf_pool.flush_list
+  @param temporary   whether the page belongs to the temporary tablespace
+  @param error       whether an error may have occurred while writing */
+  inline void write_complete(bool temporary, bool error);
+
+  /** Write a flushable page to a file or free a freeable block.
+  @param evict       whether to evict the page on write completion
+  @param space       tablespace
+  @return whether a page write was initiated and buf_pool.mutex released */
+  bool flush(bool evict, fil_space_t *space);
+
+  /** Notify that a page in a temporary tablespace has been modified. */
+  void set_temp_modified()
+  {
+    ut_ad(fsp_is_system_temporary(id().space()));
+    ut_ad(in_file());
+    ut_ad((oldest_modification() | 2) == 2);
+    oldest_modification_= 2;
+  }
+
+  /** Prepare to release a file page to buf_pool.free. */
+  void free_file_page()
+  {
+    ut_ad((zip.fix.fetch_sub(REMOVE_HASH - MEMORY)) == REMOVE_HASH);
+    /* buf_LRU_block_free_non_file_page() asserts !oldest_modification() */
+    ut_d(oldest_modification_= 0;)
+    id_= page_id_t(~0ULL);
+  }
+
+  void fix_on_recovery()
+  {
+    ut_d(const auto f=) zip.fix.fetch_sub(READ_FIX - UNFIXED - 1);
+    ut_ad(f >= READ_FIX);
+    ut_ad(f < WRITE_FIX);
+  }
+
+  /** Add count buffer-fixes. @return state() before the addition */
+  uint32_t fix(uint32_t count= 1)
+  {
+    ut_ad(count && count < IBUF_EXIST);
+    uint32_t f= zip.fix.fetch_add(count);
+    ut_ad(f >= FREED);
+    ut_ad(!((f ^ (f + count)) & LRU_MASK)); /* no carry into LRU_MASK bits */
+    return f;
+  }
+
+  /** Release one buffer-fix. @return state() after the decrement */
+  uint32_t unfix()
+  {
+    const uint32_t before= zip.fix.fetch_sub(1), after= before - 1;
+    ut_ad(before > FREED); ut_ad(!((before ^ after) & LRU_MASK));
+    return after;
+  }
+
+  /** @return the physical size, in bytes */
+  ulint physical_size() const
+  {
+    return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : srv_page_size;
+  }
+
+  /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes
+  @retval 0 if not compressed */
+  ulint zip_size() const
+  {
+    return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : 0;
+  }
+
+  /** @return the byte offset of the page within a file */
+  os_offset_t physical_offset() const
+  {
+    os_offset_t o= id().page_no();
+    return zip.ssize
+      ? o << (zip.ssize + (UNIV_ZIP_SIZE_SHIFT_MIN - 1))
+      : o << srv_page_size_shift;
+  }
+
+  /** @return whether the block is mapped to a data file */
+  bool in_file() const { return state() >= FREED; }
+
+  /** @return whether the block can be relocated in memory.
+  The block can be dirty, but it must not be I/O-fixed or bufferfixed. */
+  inline bool can_relocate() const;
+  /** @return whether the block has been flagged old in buf_pool.LRU */
+  inline bool is_old() const;
+  /** Set whether a block is old in buf_pool.LRU */
+  inline void set_old(bool old);
+  /** Record the time of the first access to the page in buf_pool
+  @return whether this is not the first access */
+  bool set_accessed()
+  {
+    const bool seen= is_accessed() != 0;
+    if (!seen) access_time= static_cast<uint32_t>(ut_time_ms());
+    return seen;
+  }
+  /** @return ut_time_ms() at the time of first access of a block in buf_pool
+  @retval 0 if not accessed */
+  unsigned is_accessed() const { ut_ad(in_file()); return access_time; }
+};
+
+/** The buffer control block structure */
+
+struct buf_block_t{
+
+	/** @name General fields */
+	/* @{ */
+
+	buf_page_t	page;		/*!< page information; this must
+					be the first field, so that
+					buf_pool.page_hash can point
+					to buf_page_t or buf_block_t */
+#ifdef UNIV_DEBUG
+  /** whether page.list is in buf_pool.withdraw
+  ((state() == NOT_USED)) and the buffer pool is being shrunk;
+  protected by buf_pool.mutex */
+  bool in_withdraw_list;
+  /** whether unzip_LRU is in buf_pool.unzip_LRU
+  (in_file() && frame && zip.data);
+  protected by buf_pool.mutex */
+  bool in_unzip_LRU_list;
+#endif
+  /** member of buf_pool.unzip_LRU (if belongs_to_unzip_LRU()) */
+  UT_LIST_NODE_T(buf_block_t) unzip_LRU;
+	/* @} */
+	/** @name Optimistic search field */
+	/* @{ */
+
+	ib_uint64_t	modify_clock;	/*!< this clock is incremented every
+					time a pointer to a record on the
+					page may become obsolete; this is
+					used in the optimistic cursor
+					positioning: if the modify clock has
+					not changed, we know that the pointer
+					is still valid; this field may be
+					changed if the thread (1) owns the
+					pool mutex and the page is not
+					bufferfixed, or (2) the thread has an
+					x-latch on the block */
+	/* @} */
+#ifdef BTR_CUR_HASH_ADAPT
+	/** @name Hash search fields (unprotected)
+	NOTE that these fields are NOT protected by any semaphore! */
+	/* @{ */
+
+	volatile uint16_t n_bytes;	/*!< recommended prefix length for hash
+					search: number of bytes in
+					an incomplete last field */
+	volatile uint16_t n_fields;	/*!< recommended prefix length for hash
+					search: number of full fields */
+	uint16_t	n_hash_helps;	/*!< counter which controls building
+					of a new hash index for the page */
+	volatile bool	left_side;	/*!< true or false, depending on
+					whether the leftmost record of several
+					records with the same prefix should be
+					indexed in the hash index */
+	/* @} */
+
+	/** @name Hash search fields
+	These 5 fields may only be modified when:
+	we are holding the appropriate x-latch in btr_search_latches[], and
+	one of the following holds:
+	(1) in_file(), and we are holding lock in any mode, or
+	(2) !is_read_fixed()&&(state()>=UNFIXED||state()==REMOVE_HASH).
+
+	An exception to this is when we init or create a page
+	in the buffer pool in buf0buf.cc.
+
+	Another exception for buf_pool_t::clear_hash_index() is that
+	assigning block->index = NULL (and block->n_pointers = 0)
+	is allowed whenever all AHI latches are exclusively locked.
+
+	Another exception is that ha_insert_for_fold() may
+	decrement n_pointers without holding the appropriate latch
+	in btr_search_latches[]. Thus, n_pointers must be
+	protected by atomic memory access.
+
+	This implies that the fields may be read without race
+	condition whenever any of the following hold:
+	- the btr_search_sys.partition[].latch is being held, or
+	- state() == NOT_USED || state() == MEMORY,
+	and holding some latch prevents the state from changing to that.
+
+	Some use of assert_block_ahi_empty() or assert_block_ahi_valid()
+	is prone to race conditions while buf_pool_t::clear_hash_index() is
+	executing (the adaptive hash index is being disabled). Such use
+	is explicitly commented. */
+
+	/* @{ */
+
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	Atomic_counter<ulint>
+			n_pointers;	/*!< used in debugging: the number of
+					pointers in the adaptive hash index
+					pointing to this frame */
+#  define assert_block_ahi_empty(block)					\
+	ut_a((block)->n_pointers == 0)
+#  define assert_block_ahi_empty_on_init(block) do {			\
+	MEM_MAKE_DEFINED(&(block)->n_pointers, sizeof (block)->n_pointers); \
+	assert_block_ahi_empty(block);					\
+} while (0)
+#  define assert_block_ahi_valid(block)					\
+	ut_a((block)->index || (block)->n_pointers == 0)
+# else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+#  define assert_block_ahi_empty(block) /* nothing */
+#  define assert_block_ahi_empty_on_init(block) /* nothing */
+#  define assert_block_ahi_valid(block) /* nothing */
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	unsigned	curr_n_fields:10;/*!< prefix length for hash indexing:
+					number of full fields */
+	unsigned	curr_n_bytes:15;/*!< number of bytes in hash
+					indexing */
+	unsigned	curr_left_side:1;/*!< TRUE or FALSE in hash indexing */
+	dict_index_t*	index;		/*!< Index for which the
+					adaptive hash index has been
+					created, or NULL if the page
+					does not exist in the
+					index. Note that it does not
+					guarantee that the index is
+					complete, though: there may
+					have been hash collisions,
+					record deletions, etc. */
+	/* @} */
+#else /* BTR_CUR_HASH_ADAPT */
+# define assert_block_ahi_empty(block) /* nothing */
+# define assert_block_ahi_empty_on_init(block) /* nothing */
+# define assert_block_ahi_valid(block) /* nothing */
+#endif /* BTR_CUR_HASH_ADAPT */
+  void fix() { page.fix(); }
+  uint32_t unfix() { return page.unfix(); }
+
+  /** @return the physical size, in bytes */
+  ulint physical_size() const { return page.physical_size(); }
+
+  /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes
+  @retval 0 if not compressed */
+  ulint zip_size() const { return page.zip_size(); }
+
+  /** Initialize the block.
+  @param page_id  page identifier
+  @param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+  @param state    initial state() */
+  void initialise(const page_id_t page_id, ulint zip_size, uint32_t state);
+};
+
+/**********************************************************************//**
+Compute the hash fold value for blocks in buf_pool.zip_hash. */
+/* @{ */
+#define BUF_POOL_ZIP_FOLD_PTR(ptr) (ulint(ptr) >> srv_page_size_shift)
+#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->page.frame)
+#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b))
+/* @} */
+
+/** A "Hazard Pointer" class used to iterate over buf_pool.LRU or
+buf_pool.flush_list. A hazard pointer is a buf_page_t pointer
+which we intend to iterate over next and we want it to remain valid
+even after we release the mutex that protects the list. */
+class HazardPointer
+{
+public:
+  virtual ~HazardPointer() = default;
+
+  /** @return current value */
+  buf_page_t *get() const { mysql_mutex_assert_owner(m_mutex); return m_hp; }
+
+  /** Set current value
+  @param bpage buffer block to be set as hp */
+  void set(buf_page_t *bpage)
+  {
+    mysql_mutex_assert_owner(m_mutex);
+    ut_ad(!bpage || bpage->in_file());
+    m_hp= bpage;
+  }
+
+  /** Checks if a bpage is the hp
+  @param bpage  buffer block to be compared
+  @return true if it is hp */
+  bool is_hp(const buf_page_t *bpage) const
+  { mysql_mutex_assert_owner(m_mutex); return bpage == m_hp; }
+
+  /** Adjust the value of hp. This happens when some
+  other thread working on the same list attempts to
+  remove the hp from the list. */
+  virtual void adjust(const buf_page_t*) = 0;
+
+#ifdef UNIV_DEBUG
+  /** mutex that protects access to the m_hp. */
+  const mysql_mutex_t *m_mutex= nullptr;
+#endif /* UNIV_DEBUG */
+
+protected:
+  /** hazard pointer */
+  buf_page_t *m_hp= nullptr;
+};
+
+/** Hazard pointer for iterating over buf_pool.flush_list */
+class FlushHp : public HazardPointer
+{
+public:
+  ~FlushHp() override = default;
+
+  /** Move the hazard pointer out of the way of a block that another
+  thread working on the same list attempts to remove. If bpage is the
+  current hp, hp is replaced by its predecessor in the list.
+  @param bpage  buffer block that is compared to hp */
+  MY_ATTRIBUTE((nonnull))
+  void adjust(const buf_page_t *bpage) override
+  {
+    /* Only reverse traversal is supported, hence the predecessor. */
+    if (is_hp(bpage)) {
+      m_hp= UT_LIST_GET_PREV(list, m_hp);
+    }
+    ut_ad(!m_hp || m_hp->oldest_modification());
+  }
+};
+
+/** Hazard pointer for iterating over buf_pool.LRU */
+class LRUHp : public HazardPointer {
+public:
+  ~LRUHp() override = default;
+
+  /** Move the hazard pointer out of the way of a block that another
+  thread working on the same list attempts to remove. If bpage is the
+  current hp, hp is replaced by its predecessor in the LRU list.
+  @param bpage  buffer block that is compared to hp */
+  MY_ATTRIBUTE((nonnull))
+  void adjust(const buf_page_t *bpage) override
+  {
+    /* Only reverse traversal is supported, hence the predecessor. */
+    if (is_hp(bpage)) {
+      m_hp= UT_LIST_GET_PREV(LRU, m_hp);
+    }
+    ut_ad(!m_hp || m_hp->in_LRU_list);
+  }
+};
+
+/** Special purpose iterators to be used when scanning the LRU list.
+The idea is that when one thread finishes the scan it leaves the
+itr in that position and the other thread can start scan from
+there */
+class LRUItr : public LRUHp {
+public:
+  ~LRUItr() override = default;
+
+  /** Select from where to start a scan. If we have scanned
+  too deep into the LRU list it resets the value to the tail
+  of the LRU list.
+  @return buf_page_t from where to start scan. */
+  inline buf_page_t *start();
+};
+
+/** Struct that is embedded in the free zip blocks */
+struct buf_buddy_free_t {
+	union {
+		ulint	size;	/*!< size of the block */
+		byte	bytes[FIL_PAGE_DATA];
+				/*!< stamp.bytes at offset
+				FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID (space_id)
+				== BUF_BUDDY_FREE_STAMP denotes a free
+				block. If the space_id field of buddy
+				block != BUF_BUDDY_FREE_STAMP, the block
+				is not in any zip_free list. If the space_id
+				is BUF_BUDDY_FREE_STAMP then stamp.size will
+				contain the buddy block size. */
+	} stamp;
+
+	buf_page_t	bpage;	/*!< Embedded bpage descriptor */
+	UT_LIST_NODE_T(buf_buddy_free_t) list;
+				/*!< Node of zip_free list */
+};
+
+/** @brief The buffer pool statistics structure;
+protected by buf_pool.mutex unless otherwise noted. */
+struct buf_pool_stat_t{
+	/** Reset all counters by zero-filling the whole structure */
+	void init() { memset((void*) this, 0, sizeof *this); }
+
+	ib_counter_t<ulint, ib_counter_element_t>	n_page_gets;
+				/*!< number of page gets performed;
+				also successful searches through
+				the adaptive hash index are
+				counted as page gets;
+				NOT protected by buf_pool.mutex */
+	ulint	n_pages_read;	/*!< number of read operations */
+	ulint	n_pages_written;/*!< number of write operations */
+	ulint	n_pages_created;/*!< number of pages created
+				in the pool with no read */
+	ulint	n_ra_pages_read_rnd;/*!< number of pages read in
+				as part of random read ahead */
+	ulint	n_ra_pages_read;/*!< number of pages read in
+				as part of read ahead */
+	ulint	n_ra_pages_evicted;/*!< number of read ahead
+				pages that are evicted without
+				being accessed */
+	ulint	n_pages_made_young; /*!< number of pages made young, in
+				buf_page_make_young() */
+	ulint	n_pages_not_made_young; /*!< number of pages not made
+				young because the first access
+				was not long enough ago, in
+				buf_page_peek_if_too_old() */
+	/** number of waits for eviction */
+	ulint	LRU_waits;
+	ulint	LRU_bytes;	/*!< LRU size in bytes */
+};
+
+/** Statistics of buddy blocks of one size; see buf_pool_t::buddy_stat[] */
+struct buf_buddy_stat_t {
+	/** Number of blocks allocated from the buddy system. */
+	ulint		used;
+	/** Number of blocks relocated by the buddy system. */
+	ib_uint64_t	relocated;
+	/** Total duration of block relocations, in microseconds. */
+	ib_uint64_t	relocated_usec;
+};
+
+/** The buffer pool */
+class buf_pool_t
+{
+  /** A chunk of buffers */
+  struct chunk_t
+  {
+    /** number of elements in blocks[] */
+    size_t size;
+    /** memory allocated for the page frames */
+    unsigned char *mem;
+    /** descriptor of mem */
+    ut_new_pfx_t mem_pfx;
+    /** array of buffer control blocks */
+    buf_block_t *blocks;
+
+    /** Map of first page frame address to chunks[] */
+    using map= std::map<const void*, chunk_t*, std::less<const void*>,
+                        ut_allocator<std::pair<const void* const,chunk_t*>>>;
+    /** Chunk map that may be under construction by buf_resize_thread() */
+    static map *map_reg;
+    /** Current chunk map for lookup only */
+    static map *map_ref;
+
+    /** @return the size of mem in bytes, as recorded in mem_pfx */
+    size_t mem_size() const { return mem_pfx.m_size; }
+
+    /** Register the chunk in map_reg, keyed by its first page frame */
+    void reg() { map_reg->emplace(map::value_type(blocks->page.frame, this)); }
+
+    /** Allocate a chunk of buffer frames.
+    @param bytes    requested size
+    @return whether the allocation succeeded */
+    inline bool create(size_t bytes);
+
+#ifdef UNIV_DEBUG
+    /** Search blocks[] for a ROW_FORMAT=COMPRESSED page frame.
+    @param data  pointer to the start of a ROW_FORMAT=COMPRESSED page frame
+    @return the first block whose page.zip.data equals data
+    @retval nullptr  if not found */
+    const buf_block_t *contains_zip(const void *data) const
+    {
+      const buf_block_t *const end= blocks + size;
+      for (const buf_block_t *b= blocks; b != end; b++)
+        if (b->page.zip.data == data)
+          return b;
+      return nullptr;
+    }
+
+    /** Check that all blocks are in a replaceable state.
+    @return address of a non-free block
+    @retval nullptr if all freed */
+    inline const buf_block_t *not_freed() const;
+#endif /* UNIV_DEBUG */
+  };
+public:
+  /** Hash cell chain in page_hash_table */
+  struct hash_chain
+  {
+    /** first block of the chain (linked via buf_page_t::hash), or nullptr */
+    buf_page_t *first;
+  };
+private:
+  /** Withdraw blocks from the buffer pool until meeting withdraw_target.
+  @return whether retry is needed */
+  inline bool withdraw_blocks();
+
+  /** Determine if a pointer belongs to a buf_block_t, that is, whether it
+  points into blocks[] of the first min(n_chunks, n_chunks_new) chunks.
+  @param ptr    a pointer that will not be dereferenced
+  @return whether the ptr belongs to a buf_block_t struct */
+  bool is_block_field(const void *ptr) const
+  {
+    /* TODO: protect chunks with a mutex (the older pointer will
+    currently remain during resize()) */
+    const chunk_t *c= chunks;
+    for (auto left= ut_min(n_chunks, n_chunks_new); left--; c++)
+    {
+      const void *const first= c->blocks;
+      if (ptr >= first && ptr < static_cast<const void*>(c->blocks + c->size))
+        return true;
+    }
+    return false;
+  }
+
+  /** Try to reallocate a control block.
+  @param block  control block to reallocate
+  @return whether the reallocation succeeded */
+  inline bool realloc(buf_block_t *block);
+
+public:
+  bool is_initialised() const { return chunks != nullptr; }
+
+  /** Create the buffer pool.
+  @return whether the creation failed */
+  bool create();
+
+  /** Clean up after successful create() */
+  void close();
+
+  /** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */
+  inline void resize();
+
+  /** @return whether resize() is in progress (relaxed load of resizing) */
+  bool resize_in_progress() const
+  {
+    return UNIV_UNLIKELY(resizing.load(std::memory_order_relaxed));
+  }
+
+  /** @return the current size in blocks, summed over the chunks in use */
+  size_t get_n_pages() const
+  {
+    ut_ad(is_initialised());
+    size_t total= 0;
+    for (ulint i= 0, n= ut_min(n_chunks_new, n_chunks); i < n; i++)
+      total+= chunks[i].size;
+    return total;
+  }
+
+  /** Determine whether a frame is intended to be withdrawn during resize().
+  @param ptr    pointer within a buf_page_t::frame
+  @return whether ptr is in a frame of chunks[n_chunks_new..n_chunks) */
+  bool will_be_withdrawn(const byte *ptr) const
+  {
+    ut_ad(n_chunks_new < n_chunks);
+#ifdef SAFE_MUTEX
+    if (resize_in_progress())
+      mysql_mutex_assert_owner(&mutex);
+#endif /* SAFE_MUTEX */
+    for (ulint i= n_chunks_new; i != n_chunks; i++)
+    {
+      const chunk_t &c= chunks[i];
+      if (ptr >= c.blocks->page.frame &&
+          ptr < c.blocks[c.size - 1].page.frame + srv_page_size)
+        return true;
+    }
+    return false;
+  }
+
+  /** Determine whether a block is intended to be withdrawn during resize().
+  @param bpage  buffer pool block
+  @return whether bpage is in blocks[] of chunks[n_chunks_new..n_chunks) */
+  bool will_be_withdrawn(const buf_page_t &bpage) const
+  {
+    ut_ad(n_chunks_new < n_chunks);
+#ifdef SAFE_MUTEX
+    if (resize_in_progress())
+      mysql_mutex_assert_owner(&mutex);
+#endif /* SAFE_MUTEX */
+    for (ulint i= n_chunks_new; i != n_chunks; i++)
+    {
+      const chunk_t &c= chunks[i];
+      if (&bpage >= &c.blocks->page &&
+          &bpage < &c.blocks[c.size].page)
+        return true;
+    }
+    return false;
+  }
+
+  /** Release and evict a corrupted page.
+  @param bpage    x-latched page that was found corrupted
+  @param state    expected current state of the page */
+  ATTRIBUTE_COLD void corrupted_evict(buf_page_t *bpage, uint32_t state);
+
+  /** Release a memory block to the buffer pool. */
+  ATTRIBUTE_COLD void free_block(buf_block_t *block);
+
+#ifdef UNIV_DEBUG
+  /** Find a block that points to a ROW_FORMAT=COMPRESSED page
+  @param data  pointer to the start of a ROW_FORMAT=COMPRESSED page frame
+  @return the block
+  @retval nullptr  if not found */
+  const buf_block_t *contains_zip(const void *data) const
+  {
+    mysql_mutex_assert_owner(&mutex);
+    /* ask each of the n_chunks chunks in turn; the first match wins */
+    for (ulint i= 0; i != n_chunks; i++)
+      if (const buf_block_t *found= chunks[i].contains_zip(data))
+        return found;
+    return nullptr;
+  }
+
+  /** Assert that all buffer pool pages are in a replaceable state */
+  void assert_all_freed();
+#endif /* UNIV_DEBUG */
+
+#ifdef BTR_CUR_HASH_ADAPT
+  /** Clear the adaptive hash index on all pages in the buffer pool. */
+  inline void clear_hash_index();
+
+  /** Get a buffer block from an adaptive hash index pointer.
+  This function does not return if the block is not identified.
+  @param ptr  pointer to within a page frame
+  @return pointer to block, never NULL */
+  inline buf_block_t *block_from_ahi(const byte *ptr) const;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+  /** Look at the tail of flush_list, removing entries that carry lsn 1.
+  @return the oldest_modification() of the last block left in flush_list
+  @retval empty_lsn if flush_list is (or became) empty */
+  lsn_t get_oldest_modification(lsn_t empty_lsn)
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    for (;;)
+    {
+      buf_page_t *last= UT_LIST_GET_LAST(flush_list);
+      if (!last)
+        return empty_lsn;
+      ut_ad(!fsp_is_system_temporary(last->id().space()));
+      const lsn_t lsn= last->oldest_modification();
+      if (lsn != 1)
+      {
+        ut_ad(lsn > 2);
+        return lsn;
+      }
+      delete_from_flush_list(last);
+    }
+  }
+
+  /** Determine if a buffer block was created by chunk_t::create().
+  @param block  block descriptor (not dereferenced)
+  @return whether block lies in the blocks[] array of a chunk in use */
+  bool is_uncompressed(const buf_block_t *block) const
+  {
+    return is_block_field(static_cast<const void*>(block));
+  }
+
+public:
+  /** Look up a page identifier while holding a shared page_hash latch.
+  @tparam allow_watch  whether to allow watch_is_sentinel()
+  @param page_id       page identifier
+  @param chain         hash table chain for page_id.fold()
+  @return whether the buffer pool contains the page */
+  template<bool allow_watch= false>
+  TRANSACTIONAL_INLINE
+  bool page_hash_contains(const page_id_t page_id, hash_chain &chain)
+  {
+    transactional_shared_lock_guard<page_hash_latch> g
+      {page_hash.lock_get(chain)};
+    buf_page_t *const found= page_hash.get(page_id, chain);
+    if (found < &watch[0] || found >= &watch[UT_ARR_SIZE(watch)])
+      return found;
+    /* found is one of the watch[] sentinels rather than a real page;
+    it counts as present only if the caller asked for that */
+    ut_ad(!found->in_zip_hash);
+    ut_ad(!found->zip.data);
+    return allow_watch;
+  }
+
+  /** Determine if a block is a sentinel for a buffer pool watch.
+  @param bpage page descriptor
+  @return whether bpage is an element of watch[] */
+  bool watch_is_sentinel(const buf_page_t &bpage)
+  {
+#ifdef SAFE_MUTEX
+    DBUG_ASSERT(mysql_mutex_is_owner(&mutex) ||
+                page_hash.lock_get(page_hash.cell_get(bpage.id().fold())).
+                is_locked());
+#endif /* SAFE_MUTEX */
+    ut_ad(bpage.in_file());
+    const bool sentinel=
+      &bpage >= &watch[0] && &bpage < &watch[array_elements(watch)];
+    ut_ad(!sentinel || !bpage.in_zip_hash);
+    ut_ad(!sentinel || !bpage.zip.data);
+    return sentinel;
+  }
+
+  /** Check if a watched page has been read.
+  May only be called after !watch_set() and before invoking watch_unset().
+  @param id   page identifier
+  @return whether page_hash holds a real page (not a sentinel) for id */
+  TRANSACTIONAL_INLINE
+  bool watch_occurred(const page_id_t id)
+  {
+    hash_chain &chain= page_hash.cell_get(id.fold());
+    transactional_shared_lock_guard<page_hash_latch> g
+      {page_hash.lock_get(chain)};
+    /* The page must exist because watch_set() increments buf_fix_count. */
+    return !watch_is_sentinel(*page_hash.get(id, chain));
+  }
+
+  /** Register a watch for a page identifier.
+  @param id         page identifier
+  @param chain      page_hash.cell_get(id.fold())
+  @return a buffer page corresponding to id
+  @retval nullptr   if the block was not present in page_hash */
+  buf_page_t *watch_set(const page_id_t id, hash_chain &chain);
+
+  /** Stop watching whether a page has been read in.
+  watch_set(id) must have returned nullptr before.
+  @param id         page identifier
+  @param chain      unlocked hash table chain */
+  void watch_unset(const page_id_t id, hash_chain &chain);
+
+  /** Remove the sentinel block for the watch before replacing it with a
+  real block. watch_unset() or watch_occurred() will notice
+  that the block has been replaced with the real block.
+  @param w          sentinel
+  @param chain      locked hash table chain
+  @return           w->state() */
+  inline uint32_t watch_remove(buf_page_t *w, hash_chain &chain);
+
+  /** @return whether less than 1/4 of the buffer pool is available */
+  TPOOL_SUPPRESS_TSAN
+  bool running_out() const
+  {
+    if (recv_recovery_is_on()) return false;
+    return UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) <
+      n_chunks_new / 4 * chunks->size;
+  }
+
+  /** @return whether try_LRU_scan is clear or the free list is empty */
+  TPOOL_SUPPRESS_TSAN
+  bool ran_out() const
+  { return UNIV_UNLIKELY(!try_LRU_scan || !UT_LIST_GET_LEN(free)); }
+
+  /** @return whether the buffer pool is shrinking (n_chunks_new < n_chunks) */
+  inline bool is_shrinking() const
+  {
+    return n_chunks_new < n_chunks;
+  }
+
+#ifdef UNIV_DEBUG
+  /** Validate the buffer pool. */
+  void validate();
+#endif /* UNIV_DEBUG */
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+  /** Write information of the buf_pool to the error log. */
+  void print();
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
+
+  /** Remove a block from the LRU list (adjusting lru_hp and lru_scan_itr).
+  @return the predecessor in the LRU list */
+  buf_page_t *LRU_remove(buf_page_t *bpage)
+  {
+    mysql_mutex_assert_owner(&mutex);
+    ut_ad(bpage->in_LRU_list);
+    ut_ad(bpage->in_page_hash);
+    ut_ad(!bpage->in_zip_hash);
+    ut_ad(bpage->in_file());
+    lru_hp.adjust(bpage);
+    lru_scan_itr.adjust(bpage);
+    ut_d(bpage->in_LRU_list= false);
+    buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage); /* read before unlinking */
+    UT_LIST_REMOVE(LRU, bpage);
+    return prev;
+  }
+
+  /** Number of pages to read ahead */
+  static constexpr uint32_t READ_AHEAD_PAGES= 64;
+
+  /** Buffer pool mutex */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
+  /** current statistics; protected by mutex */
+  buf_pool_stat_t stat;
+  /** old statistics; protected by mutex */
+  buf_pool_stat_t old_stat;
+
+	/** @name General fields */
+	/* @{ */
+	ulint		curr_pool_size;	/*!< Current pool size in bytes */
+	ulint		LRU_old_ratio;  /*!< Reserve this much of the buffer
+					pool for "old" blocks */
+#ifdef UNIV_DEBUG
+	ulint		buddy_n_frames; /*!< Number of frames allocated from
+					the buffer pool to the buddy system */
+	ulint		mutex_exit_forbidden; /*!< Forbid release mutex */
+#endif
+	ut_allocator<unsigned char>	allocator;	/*!< Allocator used for
+					allocating memory for the "chunks"
+					member. */
+	ulint		n_chunks;	/*!< number of buffer pool chunks */
+	ulint		n_chunks_new;	/*!< new number of buffer pool chunks.
+					both n_chunks{,new} are protected under
+					mutex */
+	chunk_t*	chunks;		/*!< buffer pool chunks */
+	chunk_t*	chunks_old;	/*!< old buffer pool chunks to be freed
+					after resizing buffer pool */
+	/** current pool size in pages */
+	Atomic_counter<ulint> curr_size;
+	/** read-ahead request size in pages */
+	Atomic_counter<uint32_t> read_ahead_area;
+
+  /** Hash table with singly-linked overflow lists */
+  struct page_hash_table
+  {
+    static_assert(CPU_LEVEL1_DCACHE_LINESIZE >= 64, "less than 64 bytes");
+    static_assert(!(CPU_LEVEL1_DCACHE_LINESIZE & 63),
+      "not a multiple of 64 bytes");
+
+    /** Number of array[] elements per page_hash_latch.
+    Must be one less than a power of 2. */
+    static constexpr size_t ELEMENTS_PER_LATCH= 64 / sizeof(void*) - 1;
+    static constexpr size_t EMPTY_SLOTS_PER_LATCH=
+      ((CPU_LEVEL1_DCACHE_LINESIZE / 64) - 1) * (64 / sizeof(void*));
+
+    /** number of payload elements in array[] */
+    Atomic_relaxed<ulint> n_cells;
+    /** the hash table, with pad(n_cells) elements, aligned to L1 cache size */
+    hash_chain *array;
+
+    /** Create the hash table.
+    @param n  the lower bound of n_cells */
+    void create(ulint n);
+
+    /** Free the hash table and reset array to nullptr. */
+    void free() { aligned_free(array); array= nullptr; }
+
+    /** @return the index of an array element */
+    ulint calc_hash(ulint fold) const { return calc_hash(fold, n_cells); }
+    /** @return raw array index converted to padded index */
+    static ulint pad(ulint h)
+    {
+      const ulint latches= h / ELEMENTS_PER_LATCH;
+      /* also skip the EMPTY_SLOTS_PER_LATCH unused slots of each group */
+      return 1 + latches + latches * EMPTY_SLOTS_PER_LATCH + h;
+    }
+  private:
+    /** @return the hash value before any ELEMENTS_PER_LATCH padding */
+    static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); }
+
+    /** @return the index of an array element */
+    static ulint calc_hash(ulint fold, ulint n_cells)
+    {
+      return pad(hash(fold, n_cells));
+    }
+  public:
+    /** @return the latch covering a hash table chain */
+    static page_hash_latch &lock_get(hash_chain &chain)
+    {
+      static_assert(!((ELEMENTS_PER_LATCH + 1) & ELEMENTS_PER_LATCH),
+                    "must be one less than a power of 2");
+      const size_t mask= ELEMENTS_PER_LATCH * sizeof chain;
+      const size_t addr= reinterpret_cast<size_t>(&chain);
+      ut_ad(addr & mask);
+      return *reinterpret_cast<page_hash_latch*>(addr & ~mask);
+    }
+
+    /** Get a hash table slot. */
+    hash_chain &cell_get(ulint fold) const
+    { return array[calc_hash(fold, n_cells)]; }
+
+    /** Append a block descriptor to a hash bucket chain. */
+    void append(hash_chain &chain, buf_page_t *bpage)
+    {
+      ut_ad(!bpage->in_page_hash);
+      ut_ad(!bpage->hash);
+      ut_d(bpage->in_page_hash= true);
+      buf_page_t **link= &chain.first;
+      for (; *link; link= &(*link)->hash)
+      {
+        ut_ad((*link)->in_page_hash);
+      }
+      /* link refers to the null pointer that terminates the chain */
+      *link= bpage;
+    }
+
+    /** Remove a block descriptor from a hash bucket chain. */
+    void remove(hash_chain &chain, buf_page_t *bpage)
+    {
+      ut_ad(bpage->in_page_hash);
+      buf_page_t **link= &chain.first;
+      for (; *link != bpage; link= &(*link)->hash)
+      {
+        ut_ad((*link)->in_page_hash);
+      }
+      /* unlink bpage by letting its predecessor refer to its successor */
+      *link= bpage->hash;
+      ut_d(bpage->in_page_hash= false);
+      bpage->hash= nullptr;
+    }
+
+    /** Replace a block descriptor with another. */
+    void replace(hash_chain &chain, buf_page_t *old, buf_page_t *bpage)
+    {
+      ut_ad(old->in_page_hash);
+      ut_ad(bpage->in_page_hash);
+      ut_d(old->in_page_hash= false);
+      ut_ad(bpage->hash == old->hash);
+      old->hash= nullptr;
+      buf_page_t **link= &chain.first;
+      for (; *link != old; link= &(*link)->hash)
+      {
+        ut_ad((*link)->in_page_hash);
+      }
+      /* bpage->hash already equals the successor of old (asserted above) */
+      *link= bpage;
+    }
+
+    /** Look up a page in a hash bucket chain. */
+    inline buf_page_t *get(const page_id_t id, const hash_chain &chain) const;
+
+    /** Exclusively acquire all latches */
+    inline void write_lock_all();
+
+    /** Release all latches */
+    inline void write_unlock_all();
+  };
+
+  /** Hash table of file pages (buf_page_t::in_file() holds),
+  indexed by page_id_t. Protected by both mutex and page_hash.lock_get(). */
+  page_hash_table page_hash;
+
+  /** map of block->frame to buf_block_t blocks that belong
+  to buf_buddy_alloc(); protected by buf_pool.mutex */
+  hash_table_t zip_hash;
+	Atomic_counter<ulint>
+			n_pend_unzip;	/*!< number of pending decompressions */
+
+	time_t		last_printout_time;
+					/*!< when buf_print_io was last time
+					called */
+	buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES_MAX + 1];
+					/*!< Statistics of buddy system,
+					indexed by block size */
+
+	/* @} */
+
+  /** number of index page splits */
+  Atomic_counter<ulint> pages_split;
+
+  /** @name Page flushing algorithm fields */
+  /* @{ */
+
+  /** mutex protecting flush_list, buf_page_t::set_oldest_modification()
+  and buf_page_t::list pointers when !oldest_modification() */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_list_mutex;
+  /** "hazard pointer" for flush_list scans; protected by flush_list_mutex */
+  FlushHp flush_hp;
+  /** flush_list size in bytes; protected by flush_list_mutex */
+  ulint flush_list_bytes;
+  /** possibly modified persistent pages (a subset of LRU);
+  os_aio_pending_writes() is approximately COUNT(is_write_fixed()) */
+  UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
+  /** number of blocks ever added to flush_list;
+  sometimes protected by flush_list_mutex */
+  size_t flush_list_requests;
+
+  TPOOL_SUPPRESS_TSAN void add_flush_list_requests(size_t size)
+  { ut_ad(size); flush_list_requests+= size; }
+private:
+  static constexpr unsigned PAGE_CLEANER_IDLE= 1;
+  static constexpr unsigned FLUSH_LIST_ACTIVE= 2;
+  static constexpr unsigned LRU_FLUSH= 4;
+
+  /** Number of pending LRU flush * LRU_FLUSH +
+  PAGE_CLEANER_IDLE + FLUSH_LIST_ACTIVE flags */
+  unsigned page_cleaner_status;
+  /** track server activity count for signaling idle flushing */
+  ulint last_activity_count;
+public:
+  /** signalled to wake up the page_cleaner; protected by flush_list_mutex */
+  pthread_cond_t do_flush_list;
+  /** broadcast when !n_flush(); protected by flush_list_mutex */
+  pthread_cond_t done_flush_LRU;
+  /** broadcast when a batch completes; protected by flush_list_mutex */
+  pthread_cond_t done_flush_list;
+
+  /** @return number of pending LRU flush (status above the two flag bits) */
+  unsigned n_flush() const
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    return page_cleaner_status / LRU_FLUSH;
+  }
+
+  /** Increment the number of pending LRU flush */
+  inline void n_flush_inc();
+
+  /** Decrement the number of pending LRU flush */
+  inline void n_flush_dec();
+
+  /** Decrement the number of pending LRU flush
+  while holding flush_list_mutex */
+  inline void n_flush_dec_holding_mutex();
+
+  /** @return whether flush_list flushing is active */
+  bool flush_list_active() const
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    return page_cleaner_status & FLUSH_LIST_ACTIVE;
+  }
+
+  void flush_list_set_active()
+  {
+    ut_ad(!flush_list_active());
+    page_cleaner_status+= FLUSH_LIST_ACTIVE; /* flag asserted clear above */
+  }
+  void flush_list_set_inactive()
+  {
+    ut_ad(flush_list_active());
+    page_cleaner_status-= FLUSH_LIST_ACTIVE; /* flag asserted set above */
+  }
+
+  /** @return whether the page cleaner must sleep due to being idle
+  (the PAGE_CLEANER_IDLE flag); requires flush_list_mutex */
+  bool page_cleaner_idle() const noexcept
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex); return page_cleaner_status & PAGE_CLEANER_IDLE;
+  }
+
+  /** @return whether the page cleaner may be initiating writes */
+  bool page_cleaner_active() const
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    static_assert(PAGE_CLEANER_IDLE == 1, "efficiency");
+    return page_cleaner_status > PAGE_CLEANER_IDLE; /* any higher bit set */
+  }
+
+  /** Wake up the page cleaner if needed.
+  @param for_LRU  whether to wake up for LRU eviction */
+  void page_cleaner_wakeup(bool for_LRU= false);
+
+  /** Set or clear PAGE_CLEANER_IDLE (whether an explicit wakeup is needed) */
+  void page_cleaner_set_idle(bool deep_sleep)
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    page_cleaner_status= (page_cleaner_status & ~PAGE_CLEANER_IDLE) |
+      (PAGE_CLEANER_IDLE * deep_sleep);
+  }
+
+  /** Update server last activity count */
+  void update_last_activity_count(ulint activity_count)
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    last_activity_count= activity_count;
+  }
+
+	unsigned	freed_page_clock;/*!< a sequence number used
+					to count the number of buffer
+					blocks removed from the end of
+					the LRU list; NOTE that this
+					counter may wrap around at 4
+					billion! A thread is allowed
+					to read this for heuristic
+					purposes without holding any
+					mutex or latch */
+  /** Cleared when buf_LRU_get_free_block() fails.
+  Set whenever the free list grows, along with a broadcast of done_free.
+  Protected by buf_pool.mutex. */
+  Atomic_relaxed<bool> try_LRU_scan;
+	/* @} */
+
+	/** @name LRU replacement algorithm fields */
+	/* @{ */
+
+	UT_LIST_BASE_NODE_T(buf_page_t) free;
+					/*!< base node of the free
+					block list */
+  /** broadcast each time when the free list grows or try_LRU_scan is set;
+  protected by mutex */
+  pthread_cond_t done_free;
+
+	UT_LIST_BASE_NODE_T(buf_page_t) withdraw;
+					/*!< base node of the withdraw
+					block list. It is only used during
+					shrinking buffer pool size, not to
+					reuse the blocks will be removed */
+
+	ulint		withdraw_target;/*!< target length of withdraw
+					block list, when withdrawing */
+
+	/** "hazard pointer" used during scan of LRU while doing
+	LRU list batch.  Protected by buf_pool_t::mutex. */
+	LRUHp		lru_hp;
+
+	/** Iterator used to scan the LRU list when searching for a
+	replaceable victim. Protected by buf_pool_t::mutex. */
+	LRUItr		lru_scan_itr;
+
+	UT_LIST_BASE_NODE_T(buf_page_t) LRU;
+					/*!< base node of the LRU list */
+
+	buf_page_t*	LRU_old;	/*!< pointer to the about
+					LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
+					oldest blocks in the LRU list;
+					NULL if LRU length less than
+					BUF_LRU_OLD_MIN_LEN;
+					NOTE: when LRU_old != NULL, its length
+					should always equal LRU_old_len */
+	ulint		LRU_old_len;	/*!< length of the LRU list from
+					the block to which LRU_old points
+					onward, including that block;
+					see buf0lru.cc for the restrictions
+					on this value; 0 if LRU_old == NULL;
+					NOTE: LRU_old_len must be adjusted
+					whenever LRU_old shrinks or grows! */
+
+	UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU;
+					/*!< base node of the
+					unzip_LRU list */
+
+	/* @} */
+  /** free ROW_FORMAT=COMPRESSED page frames */
+  UT_LIST_BASE_NODE_T(buf_buddy_free_t) zip_free[BUF_BUDDY_SIZES_MAX];
+#if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN
+# error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN"
+#endif
+
+  /** Sentinels to detect if pages are read into the buffer pool while
+  a delete-buffering operation is pending. Protected by mutex. */
+  buf_page_t watch[innodb_purge_threads_MAX + 1];
+  /** Reserve a buffer. */
+  buf_tmp_buffer_t *io_buf_reserve() { return io_buf.reserve(); }
+
+  /** Remove a block from flush_list.
+  @param bpage   buffer pool page */
+  void delete_from_flush_list(buf_page_t *bpage) noexcept;
+
+  /** Prepare to insert a modified block into flush_list.
+  @param lsn start LSN of the mini-transaction
+  @return insert position for insert_into_flush_list() */
+  inline buf_page_t *prepare_insert_into_flush_list(lsn_t lsn) noexcept;
+
+  /** Insert a modified block into the flush list.
+  @param prev     insert position (from prepare_insert_into_flush_list())
+  @param block    modified block
+  @param lsn      start LSN of the mini-transaction that modified the block */
+  inline void insert_into_flush_list(buf_page_t *prev, buf_block_t *block,
+                                     lsn_t lsn) noexcept;
+
+  /** Free a page whose underlying file page has been freed. */
+  ATTRIBUTE_COLD void release_freed_page(buf_page_t *bpage) noexcept;
+
+private:
+  /** Temporary memory for page_compressed and encrypted I/O */
+  struct io_buf_t
+  {
+    /** number of elements in slots[] */
+    ulint n_slots;
+    /** array of slots */
+    buf_tmp_buffer_t *slots;
+
+    void create(ulint n_slots);
+
+    void close();
+
+    /** Reserve a buffer */
+    buf_tmp_buffer_t *reserve();
+  } io_buf;
+
+  /** whether resize() is in the critical path */
+  std::atomic<bool> resizing;
+};
+
+/** The InnoDB buffer pool */
+extern buf_pool_t buf_pool;
+
+inline buf_page_t * /* @return the block with id() == id, or nullptr */
+buf_pool_t::page_hash_table::get(const page_id_t id, const hash_chain &chain)
+  const
+{
+#ifdef SAFE_MUTEX
+  DBUG_ASSERT(mysql_mutex_is_owner(&buf_pool.mutex) ||
+              lock_get(const_cast<hash_chain&>(chain)).is_locked());
+#endif /* SAFE_MUTEX */
+  buf_page_t *p= chain.first;
+  for (; p; p= p->hash)
+  {
+    ut_ad(p->in_page_hash);
+    ut_ad(p->in_file());
+    if (p->id() == id) break;
+  }
+  return p;
+}
+
+#ifdef SUX_LOCK_GENERIC
+inline void page_hash_latch::lock_shared()
+{
+  mysql_mutex_assert_not_owner(&buf_pool.mutex);
+  if (read_trylock()) return; /* acquired without waiting */
+  read_lock_wait();
+}
+
+inline void page_hash_latch::lock()
+{
+  if (write_trylock()) return; /* acquired without waiting */
+  write_lock_wait();
+}
+#endif /* SUX_LOCK_GENERIC */
+
+inline void buf_page_t::set_state(uint32_t s)
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(s <= REMOVE_HASH || s >= UNFIXED);
+  ut_ad(s < WRITE_FIX);
+  ut_ad(s <= READ_FIX || zip.fix == READ_FIX); /* above READ_FIX: only from it */
+  zip.fix= s; /* the state is stored in zip.fix */
+}
+
+inline void buf_page_t::set_corrupt_id()
+{
+#ifdef UNIV_DEBUG
+  switch (oldest_modification()) {
+  case 0: /* not in buf_pool.flush_list */
+    break;
+  case 2: /* only expected for the temporary tablespace (asserted below) */
+    ut_ad(fsp_is_system_temporary(id().space()));
+    /* buf_LRU_block_free_non_file_page() asserts !oldest_modification() */
+    ut_d(oldest_modification_= 0;)
+    break;
+  default:
+    ut_ad("block is dirty" == 0);
+  }
+  const auto f= state();
+  if (f != REMOVE_HASH)
+  {
+    ut_ad(f >= UNFIXED);
+    ut_ad(buf_pool.page_hash.lock_get(buf_pool.page_hash.cell_get(id_.fold())).
+          is_write_locked());
+  }
+#endif
+  id_.set_corrupted(); /* the only statement that remains in non-debug builds */
+}
+
+/** Set oldest_modification when adding to buf_pool.flush_list */
+inline void buf_page_t::set_oldest_modification(lsn_t lsn)
+{
+  mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+  ut_ad(oldest_modification() <= 1); /* 0, or 1 (presumably awaiting removal) */
+  ut_ad(lsn > 2); /* 0, 1 and 2 are treated as special values in this file */
+  oldest_modification_= lsn;
+}
+
+/** Clear oldest_modification after removing from buf_pool.flush_list */
+inline void buf_page_t::clear_oldest_modification()
+{
+#ifdef SAFE_MUTEX
+  if (oldest_modification() != 2)
+    mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+#endif /* SAFE_MUTEX */
+  ut_d(const auto s= state());
+  ut_ad(s >= REMOVE_HASH);
+  ut_ad(oldest_modification());
+  ut_ad(!list.prev); /* the caller must already have unlinked the block */
+  ut_ad(!list.next);
+  /* We must use release memory order to guarantee that callers of
+  oldest_modification_acquire() will observe the block as
+  being detached from buf_pool.flush_list, after reading the value 0. */
+  oldest_modification_.store(0, std::memory_order_release);
+}
+
+/** @return whether the block can be relocated in memory. The block can be
+dirty, but not I/O-fixed, buffer-fixed, or locked (or waited for). */
+inline bool buf_page_t::can_relocate() const
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  const auto f= state();
+  ut_ad(f >= FREED);
+  ut_ad(in_LRU_list);
+  return (f == FREED || (f < READ_FIX && !(f & ~LRU_MASK))) &&
+    !lock.is_locked_or_waiting();
+}
+
+/** @return whether the block is flagged old; requires buf_pool.mutex */
+inline bool buf_page_t::is_old() const
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(in_file());
+  ut_ad(in_LRU_list);
+  return old;
+}
+
+/** Set whether a block is old in buf_pool.LRU; requires buf_pool.mutex */
+inline void buf_page_t::set_old(bool old)
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(in_LRU_list);
+
+#ifdef UNIV_LRU_DEBUG
+  ut_a((buf_pool.LRU_old_len == 0) == (buf_pool.LRU_old == nullptr));
+  /* If a block is flagged "old", the LRU_old list must exist. */
+  ut_a(!old || buf_pool.LRU_old);
+
+  if (UT_LIST_GET_PREV(LRU, this) && UT_LIST_GET_NEXT(LRU, this))
+  {
+    const buf_page_t *prev= UT_LIST_GET_PREV(LRU, this);
+    const buf_page_t *next = UT_LIST_GET_NEXT(LRU, this);
+    if (prev->old == next->old)
+      ut_a(prev->old == old); /* both neighbours agree: so must this block */
+    else
+    {
+      ut_a(!prev->old); /* the flag may only change from not-old to old */
+      ut_a(buf_pool.LRU_old == (old ? this : next));
+    }
+  }
+#endif /* UNIV_LRU_DEBUG */
+
+  this->old= old; /* the parameter shadows the member */
+}
+
+#ifdef UNIV_DEBUG
+/** Forbid the release of buf_pool.mutex (increment mutex_exit_forbidden). */
+# define buf_pool_mutex_exit_forbid() do {		\
+	mysql_mutex_assert_owner(&buf_pool.mutex);	\
+	buf_pool.mutex_exit_forbidden++;		\
+} while (0)
+/** Allow the release of buf_pool.mutex (assert nonzero, then decrement). */
+# define buf_pool_mutex_exit_allow() do {		\
+	mysql_mutex_assert_owner(&buf_pool.mutex);	\
+	ut_ad(buf_pool.mutex_exit_forbidden--);		\
+} while (0)
+#else
+/** Forbid the release of the buffer pool mutex (no-op without UNIV_DEBUG). */
+# define buf_pool_mutex_exit_forbid() ((void) 0)
+/** Allow the release of the buffer pool mutex (no-op without UNIV_DEBUG). */
+# define buf_pool_mutex_exit_allow() ((void) 0)
+#endif
+
+/**********************************************************************
+Let us list the consistency conditions for different control block states.
+
+NOT_USED:	is in free list, not LRU, not flush_list, nor page_hash
+MEMORY:		is not in any of free, LRU, flush_list, page_hash
+in_file():	is not in free list, is in LRU list, id() is defined,
+		is in page_hash (not necessarily if is_read_fixed())
+
+		is in buf_pool.flush_list, if and only
+		if oldest_modification == 1 || oldest_modification > 2
+
+		(1) if is_write_fixed(): is u-locked
+		(2) if is_read_fixed(): is x-locked
+
+State transitions:
+
+NOT_USED => MEMORY
+MEMORY => NOT_USED
+MEMORY => UNFIXED
+UNFIXED => in_file()
+in_file() => UNFIXED or FREED
+UNFIXED or FREED => REMOVE_HASH
+REMOVE_HASH => NOT_USED	(if and only if !oldest_modification())
+*/
+
+/** Select the position from which to start a scan. If no position
+is stored, or the stored block is flagged old, the position is
+reset to the tail of the LRU list.
+@return the block from which to start the scan */
+inline buf_page_t *LRUItr::start()
+{
+  mysql_mutex_assert_owner(m_mutex);
+
+  const bool keep= m_hp && !m_hp->old;
+  if (!keep)
+    m_hp= UT_LIST_GET_LAST(buf_pool.LRU);
+  return m_hp;
+}
+
+#ifdef UNIV_DEBUG
+/** Functor to validate buf_pool.LRU: every element must have in_LRU_list. */
+struct	CheckInLRUList {
+	void	operator()(const buf_page_t* elem) const
+	{
+		ut_a(elem->in_LRU_list);
+	}
+
+	static void validate()
+	{
+		ut_list_validate(buf_pool.LRU, CheckInLRUList());
+	}
+};
+
+/** Functor to validate buf_pool.free: every element must have in_free_list. */
+struct	CheckInFreeList {
+	void	operator()(const buf_page_t* elem) const
+	{
+		ut_a(elem->in_free_list);
+	}
+
+	static void validate()
+	{
+		ut_list_validate(buf_pool.free, CheckInFreeList());
+	}
+};
+
+struct	CheckUnzipLRUAndLRUList { /* functor to validate buf_pool.unzip_LRU */
+	void	operator()(const buf_block_t* block) const
+	{
+		ut_a(block->page.in_LRU_list);
+		ut_a(block->in_unzip_LRU_list);
+	}
+
+	static void validate()
+	{
+		ut_list_validate(buf_pool.unzip_LRU,
+				 CheckUnzipLRUAndLRUList());
+	}
+};
+#endif /* UNIV_DEBUG */
+
+#include "buf0buf.inl"
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/buf0buf.inl b/storage/innobase/include/buf0buf.inl
new file mode 100644
index 00000000..b3158cf1
--- /dev/null
+++ b/storage/innobase/include/buf0buf.inl
@@ -0,0 +1,132 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buf.inl
+The database buffer buf_pool
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0lru.h"
+
+/** Determine if a block is still close enough to the MRU end of the LRU list,
+i.e. not in danger of eviction, which also implies a recent access. The test
+is: pool freed_page_clock (low 31 bits) < block freed_page_clock + a margin.
+The page must be either buffer-fixed, or its page hash must be locked.
+@param[in]	bpage		buffer pool page
+@return whether bpage is close to MRU end of LRU */
+inline bool buf_page_peek_if_young(const buf_page_t *bpage)
+{
+	/* FIXME: bpage->freed_page_clock is 31 bits, hence the mask below */
+	return((buf_pool.freed_page_clock & ((1UL << 31) - 1))
+	       < (bpage->freed_page_clock
+		  + (buf_pool.curr_size
+		     * (BUF_LRU_OLD_RATIO_DIV - buf_pool.LRU_old_ratio)
+		     / (BUF_LRU_OLD_RATIO_DIV * 4))));
+}
+
+/** Decide whether a block ought to be moved to the start of the LRU list
+because it is in danger of dropping out of the buffer pool.
+@param[in]	bpage		buffer pool page
+@return true if bpage should be made younger */
+inline bool buf_page_peek_if_too_old(const buf_page_t *bpage)
+{
+	if (buf_pool.freed_page_clock == 0) {
+		/* Nothing has been evicted so far (warm-up phase or an
+		in-memory workload): leave the statistics and the LRU
+		list order untouched. */
+		return false;
+	}
+
+	if (!buf_LRU_old_threshold_ms || !bpage->old) {
+		return !buf_page_peek_if_young(bpage);
+	}
+
+	const uint32_t access_time = bpage->is_accessed();
+
+	/* The elapsed time is computed modulo 2^32 milliseconds (about
+	50 days), so an interval of e.g. 50 days + 15 ms is treated as
+	15 ms. This is known; fixing it would require widening
+	buf_page_t::access_time from 32 to 64 bits. */
+	if (access_time
+	    && static_cast<uint32_t>(ut_time_ms() - access_time)
+	    >= buf_LRU_old_threshold_ms) {
+		return true;
+	}
+
+	buf_pool.stat.n_pages_not_made_young++;
+	return false;
+}
+
+/** Allocate a buffer block via buf_LRU_get_free_block(have_mutex=false).
+@return own: the allocated block, in state BUF_BLOCK_MEMORY */
+inline buf_block_t *buf_block_alloc()
+{
+  return buf_LRU_get_free_block(false);
+}
+
+/********************************************************************//**
+Frees a non-file-page buffer block while holding buf_pool.mutex. */
+UNIV_INLINE
+void
+buf_block_free(
+/*===========*/
+	buf_block_t*	block)	/*!< in, own: block to be freed */
+{
+	mysql_mutex_lock(&buf_pool.mutex);
+	buf_LRU_block_free_non_file_page(block);
+	mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/********************************************************************//**
+Increments the modify clock of a frame by 1. The caller must (1) own the
+buf_pool mutex and block bufferfix count has to be zero, (2) or own a u-lock
+or x-lock on the block (the assertion below accepts have_u_or_x()). */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+	buf_block_t*	block)	/*!< in: block */
+{
+#ifdef SAFE_MUTEX
+	ut_ad((mysql_mutex_is_owner(&buf_pool.mutex)
+	       && !block->page.buf_fix_count())
+	      || block->page.lock.have_u_or_x());
+#else /* !SAFE_MUTEX: mutex ownership is not checked in this variant */
+	ut_ad(!block->page.buf_fix_count() || block->page.lock.have_u_or_x());
+#endif /* SAFE_MUTEX */
+	assert_block_ahi_valid(block);
+
+	block->modify_clock++;
+}
+
+/********************************************************************//**
+Returns the value of the modify clock. The caller must hold a latch on
+the block; this is asserted via ut_ad(block->page.lock.have_any()).
+@return value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+	buf_block_t*	block)	/*!< in: block */
+{
+	ut_ad(block->page.lock.have_any());
+	return(block->modify_clock);
+}
diff --git a/storage/innobase/include/buf0checksum.h b/storage/innobase/include/buf0checksum.h
new file mode 100644
index 00000000..d9f03177
--- /dev/null
+++ b/storage/innobase/include/buf0checksum.h
@@ -0,0 +1,57 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0checksum.h
+Buffer pool checksum functions, also linked from /extra/innochecksum.cc
+
+Created Aug 11, 2011 Vasil Dimov
+*******************************************************/
+
+#pragma once
+#include "buf0types.h"
+
+/** Calculate the CRC32 checksum of a page. The value is stored to the page
+when it is written to a file and also checked for a match when reading from
+the file. Note that we must be careful to calculate the same value on all
+architectures.
+@param[in]	page			buffer page (srv_page_size bytes)
+@return	CRC-32C */
+uint32_t buf_calc_page_crc32(const byte* page);
+
+#ifndef UNIV_INNOCHECKSUM
+/** Calculate a checksum which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@param[in]	page	file page (srv_page_size bytes)
+@return checksum */
+uint32_t
+buf_calc_page_new_checksum(const byte* page);
+
+/** In MySQL before 4.0.14 or 4.1.1 there was an InnoDB bug that
+the checksum only looked at the first few bytes of the page.
+This calculates that old checksum.
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@param[in]	page	file page (srv_page_size bytes)
+@return checksum */
+uint32_t
+buf_calc_page_old_checksum(const byte* page);
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h
new file mode 100644
index 00000000..9932b0e5
--- /dev/null
+++ b/storage/innobase/include/buf0dblwr.h
@@ -0,0 +1,164 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0dblwr.h
+Doublewrite buffer module
+
+Created 2011/12/19 Inaam Rana
+*******************************************************/
+
+#pragma once
+
+#include "os0file.h"
+#include "buf0types.h"
+
+/** Doublewrite buffer control object; the global instance is buf_dblwr */
+class buf_dblwr_t
+{
+  struct element
+  {
+    /** asynchronous write request */
+    IORequest request;
+    /** payload size in bytes */
+    size_t size;
+  };
+
+  struct slot
+  {
+    /** first free position in write_buf measured in units of
+     * srv_page_size */
+    ulint first_free;
+    /** number of slots reserved for the current write batch */
+    ulint reserved;
+    /** the doublewrite buffer, aligned to srv_page_size */
+    byte* write_buf;
+    /** buffer blocks to be written via write_buf */
+    element* buf_block_arr;
+  };
+
+  /** the page number of the first doublewrite block (block_size() pages) */
+  page_id_t block1{0, 0};
+  /** the page number of the second doublewrite block (block_size() pages) */
+  page_id_t block2{0, 0};
+
+  /** mutex protecting the data members below */
+  mysql_mutex_t mutex;
+  /** condition variable for !batch_running */
+  pthread_cond_t cond;
+  /** whether a batch is being written from the doublewrite buffer */
+  bool batch_running;
+  /** number of expected flush_buffered_writes_completed() calls */
+  unsigned flushing_buffered_writes;
+  /** number of flush_buffered_writes_completed() calls */
+  ulint writes_completed;
+  /** number of pages written by flush_buffered_writes_completed() */
+  ulint pages_written;
+
+  slot slots[2]; /* the two batch slots */
+  slot *active_slot; /* NOTE(review): presumably points into slots[]; confirm */
+
+  /** Initialise the persistent storage of the doublewrite buffer.
+  @param header   doublewrite page header in the TRX_SYS page */
+  inline void init(const byte *header);
+
+  /** Flush possible buffered writes to persistent storage. */
+  bool flush_buffered_writes(const ulint size);
+
+public:
+  /** Initialise the doublewrite buffer data structures. */
+  void init();
+  /** Create or restore the doublewrite buffer in the TRX_SYS page.
+  @return whether the operation succeeded */
+  bool create();
+  /** Free the doublewrite buffer. */
+  void close();
+
+  /** Acquire the mutex */
+  void lock() { mysql_mutex_lock(&mutex); }
+  /** @return the number of completed batches; the caller must hold mutex */
+  ulint batches() const
+  { mysql_mutex_assert_owner(&mutex); return writes_completed; }
+  /** @return the number of final pages written; the caller must hold mutex */
+  ulint written() const
+  { mysql_mutex_assert_owner(&mutex); return pages_written; }
+  /** Release the mutex */
+  void unlock() { mysql_mutex_unlock(&mutex); }
+
+  /** Initialize the doublewrite buffer memory structure on recovery.
+  If we are upgrading from a version before MySQL 4.1, then this
+  function performs the necessary update operations to support
+  innodb_file_per_table. If we are in a crash recovery, this function
+  loads the pages from double write buffer into memory.
+  @param file File handle
+  @param path Path name of file
+  @return DB_SUCCESS or error code */
+  dberr_t init_or_load_pages(pfs_os_file_t file, const char *path);
+
+  /** Process and remove the double write buffer pages for all tablespaces. */
+  void recover();
+
+  /** Update the doublewrite buffer on data page write completion. */
+  void write_completed();
+  /** Flush possible buffered writes to persistent storage.
+  It is very important to call this function after a batch of writes has been
+  posted, and also when we may have to wait for a page latch!
+  Otherwise a deadlock of threads can occur. */
+  void flush_buffered_writes();
+  /** Update the doublewrite buffer on write batch completion
+  @param request  the completed batch write request */
+  void flush_buffered_writes_completed(const IORequest &request);
+
+  /** @return size of each doublewrite block in pages (FSP_EXTENT_SIZE) */
+  uint32_t block_size() const { return FSP_EXTENT_SIZE; }
+
+  /** Schedule a page write. If the doublewrite memory buffer is full,
+  flush_buffered_writes() will be invoked to make space.
+  @param request    asynchronous write request
+  @param size       payload size in bytes */
+  void add_to_batch(const IORequest &request, size_t size);
+
+  /** @return whether the doublewrite buffer has been created (block1 set) */
+  bool is_created() const
+  { return UNIV_LIKELY(block1 != page_id_t(0, 0)); }
+
+  /** @return whether a page identifier is part of the doublewrite buffer */
+  bool is_inside(const page_id_t id) const
+  {
+    if (!is_created())
+      return false;
+    ut_ad(block1 < block2);
+    if (id < block1) /* precedes both blocks, given block1 < block2 */
+      return false;
+    const uint32_t size= block_size();
+    return id < block1 + size || (id >= block2 && id < block2 + size);
+  }
+
+  /** Wait until !batch_running: flush_buffered_writes() fully completed */
+  void wait_flush_buffered_writes()
+  {
+    mysql_mutex_lock(&mutex);
+    while (batch_running)
+      my_cond_wait(&cond, &mutex.m_mutex);
+    mysql_mutex_unlock(&mutex);
+  }
+};
+
+/** The doublewrite buffer */
+extern buf_dblwr_t buf_dblwr;
diff --git a/storage/innobase/include/buf0dump.h b/storage/innobase/include/buf0dump.h
new file mode 100644
index 00000000..48586900
--- /dev/null
+++ b/storage/innobase/include/buf0dump.h
@@ -0,0 +1,44 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0dump.h
+Implements a buffer pool dump/load.
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#ifndef buf0dump_h
+#define buf0dump_h
+
+/** Start the buffer pool dump/load task and instruct it to start a dump. */
+void buf_dump_start();
+/** Start the buffer pool dump/load task and instruct it to start a load. */
+void buf_load_start();
+
+/** Abort a currently running buffer pool load. */
+void buf_load_abort();
+
+/** Start async buffer pool load, if srv_buffer_pool_load_at_startup was set.*/
+void buf_load_at_startup();
+
+/** Wait for currently running loads/dumps to finish. */
+void buf_load_dump_end();
+
+#endif /* buf0dump_h */
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h
new file mode 100644
index 00000000..0cce514b
--- /dev/null
+++ b/storage/innobase/include/buf0flu.h
@@ -0,0 +1,125 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0flu.h
+The database buffer pool flush algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "ut0byte.h"
+#include "log0log.h"
+#include "buf0buf.h"
+
+/** Number of pages flushed via LRU. Protected by buf_pool.mutex.
+Also included in buf_pool.stat.n_pages_written. */
+extern ulint buf_lru_flush_page_count;
+/** Number of pages freed without flushing. Protected by buf_pool.mutex. */
+extern ulint buf_lru_freed_page_count;
+
+/** Flag indicating if the page_cleaner is in active state. */
+extern Atomic_relaxed<bool> buf_page_cleaner_is_active;
+
+/** Remove all dirty pages belonging to a given tablespace when we are
+deleting the data file of that tablespace.
+The pages still remain a part of LRU and are evicted from
+the list as they age towards the tail of the LRU.
+@param id    tablespace identifier */
+void buf_flush_remove_pages(uint32_t id);
+
+/*******************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage has already been
+copied to dpage. */
+ATTRIBUTE_COLD
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+	buf_page_t*	bpage,	/*!< in/out: control block being moved */
+	buf_page_t*	dpage);	/*!< in/out: destination block */
+
+/** Complete write of a file page from buf_pool.
+@param request write request
+@param error   whether the write may have failed */
+void buf_page_write_complete(const IORequest &request, bool error);
+
+/** Assign the full crc32 checksum for non-compressed page.
+@param[in,out]	page	page to be updated */
+void buf_flush_assign_full_crc32_checksum(byte* page);
+
+/** Initialize a page for writing to the tablespace.
+@param[in]	block			buffer block; NULL if bypassing the buffer pool
+@param[in,out]	page			page frame
+@param[in,out]	page_zip_		compressed page, or NULL if uncompressed
+@param[in]	use_full_checksum	whether tablespace uses full checksum */
+void
+buf_flush_init_for_writing(
+	const buf_block_t*	block,
+	byte*			page,
+	void*			page_zip_,
+	bool			use_full_checksum);
+
+/** Try to flush dirty pages that belong to a given tablespace.
+@param space       tablespace
+@param n_flushed   number of pages written
+@return whether the flush for some pages might not have been initiated */
+bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr)
+  MY_ATTRIBUTE((warn_unused_result));
+
+/** Write out dirty blocks from buf_pool.LRU,
+and move clean blocks to buf_pool.free.
+The caller must invoke buf_dblwr.flush_buffered_writes()
+after releasing buf_pool.mutex.
+@param max_n    wished maximum number of blocks flushed
+@param evict    whether to evict pages after flushing
+@return evict ? number of processed pages : number of pages written
+@retval 0 if a buf_pool.LRU batch is already running */
+ulint buf_flush_LRU(ulint max_n, bool evict);
+
+/** Wait until a LRU flush batch ends. */
+void buf_flush_wait_LRU_batch_end();
+/** Wait until all persistent pages are flushed up to a limit.
+@param sync_lsn   buf_pool.get_oldest_modification(LSN_MAX) to wait for */
+ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn);
+/** Initiate more eager page flushing if the log checkpoint age is too old.
+@param lsn      buf_pool.get_oldest_modification(LSN_MAX) target
+@param furious  true=furious flushing, false=limit to innodb_io_capacity */
+ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious);
+
+/** Initialize page_cleaner. */
+ATTRIBUTE_COLD void buf_flush_page_cleaner_init();
+
+/** Flush the buffer pool on shutdown. */
+ATTRIBUTE_COLD void buf_flush_buffer_pool();
+
+#ifdef UNIV_DEBUG
+/** Validate the flush list. */
+void buf_flush_validate();
+#endif /* UNIV_DEBUG */
+
+/** Synchronously flush dirty blocks during recv_sys_t::apply().
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync_batch(lsn_t lsn);
+
+/** Synchronously flush dirty blocks.
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync();
diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h
new file mode 100644
index 00000000..aec08e77
--- /dev/null
+++ b/storage/innobase/include/buf0lru.h
@@ -0,0 +1,193 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0lru.h
+The database buffer pool LRU replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "buf0types.h"
+#include "hash0hash.h"
+
+// Forward declaration
+struct trx_t;
+struct fil_space_t;
+
+/** Flush this many pages in buf_LRU_get_free_block() */
+extern size_t innodb_lru_flush_size;
+
+/*#######################################################################
+These are low-level functions
+#########################################################################*/
+
+/** Minimum LRU list length for which the LRU_old pointer is defined */
+#define BUF_LRU_OLD_MIN_LEN	512	/* 8 megabytes of 16k pages */
+
+/** Try to free a block. If bpage is a descriptor of a compressed-only
+ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well.
+The caller must hold buf_pool.mutex.
+@param bpage      block to be freed
+@param zip        whether to remove both copies of a ROW_FORMAT=COMPRESSED page
+@retval true if freed and buf_pool.mutex may have been temporarily released
+@retval false if the page was not freed */
+bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
+  MY_ATTRIBUTE((nonnull));
+
+/** Try to free a replaceable block.
+@param limit  maximum number of blocks to scan
+@return true if found and freed */
+bool buf_LRU_scan_and_free_block(ulint limit= ULINT_UNDEFINED);
+
+/** @return a buffer block from the buf_pool.free list
+@retval	NULL	if the free list is empty */
+buf_block_t* buf_LRU_get_free_only();
+
+/** Get a block from the buf_pool.free list.
+If the list is empty, blocks will be moved from the end of buf_pool.LRU
+to buf_pool.free.
+
+This function is called from a user thread when it needs a clean
+block to read in a page. Note that we only ever get a block from
+the free list. Even when we flush a page or find a page in LRU scan
+we put it to free list to be used.
+* iteration 0:
+  * get a block from the buf_pool.free list, success:done
+  * if buf_pool.try_LRU_scan is set
+    * scan LRU up to 100 pages to free a clean block
+    * success:retry the free list
+  * flush up to innodb_lru_flush_size LRU blocks to data files
+    (until UT_LIST_GET_LEN(buf_pool.free) < innodb_lru_scan_depth)
+    * on buf_page_write_complete() the blocks will be put on buf_pool.free list
+    * success: retry the free list
+* subsequent iterations: same as iteration 0 except:
+  * scan whole LRU list
+  * scan LRU list even if buf_pool.try_LRU_scan is not set
+
+@param have_mutex  whether buf_pool.mutex is already being held
+@return the free control block, in state BUF_BLOCK_MEMORY */
+buf_block_t* buf_LRU_get_free_block(bool have_mutex)
+	MY_ATTRIBUTE((malloc,warn_unused_result));
+
+/** @return whether the unzip_LRU list should be used for evicting a victim
+instead of the general LRU list */
+bool buf_LRU_evict_from_unzip_LRU();
+
+/** Puts a block back to the free list.
+@param[in]	block	block; not containing a file page */
+void
+buf_LRU_block_free_non_file_page(buf_block_t* block);
+/******************************************************************//**
+Adds a block to the LRU list. Please make sure that the page_size is
+already set when invoking the function, so that we can get correct
+page_size from the buffer page when adding a block into LRU */
+void
+buf_LRU_add_block(
+/*==============*/
+	buf_page_t*	bpage,	/*!< in: control block */
+	bool		old);	/*!< in: true if should be put to the old
+				blocks in the LRU list, else put to the
+				start; if the LRU list is very short, added to
+				the start regardless of this parameter */
+/******************************************************************//**
+Adds a block to the LRU list of decompressed zip pages. */
+void
+buf_unzip_LRU_add_block(
+/*====================*/
+	buf_block_t*	block,	/*!< in: control block */
+	ibool		old);	/*!< in: TRUE if should be put to the end
+				of the list, else put to the start */
+
+/** Update buf_pool.LRU_old_ratio.
+@param[in]	old_pct		Reserve this percentage of
+				the buffer pool for "old" blocks
+@param[in]	adjust		true=adjust the LRU list;
+				false=just assign buf_pool.LRU_old_ratio
+				during the initialization of InnoDB
+@return updated old_pct */
+uint buf_LRU_old_ratio_update(uint old_pct, bool adjust);
+/********************************************************************//**
+Update the historical stats that we are collecting for LRU eviction
+policy at the end of each interval. */
+void
+buf_LRU_stat_update();
+
+#ifdef UNIV_DEBUG
+/** Validate the LRU list. */
+void buf_LRU_validate();
+#endif /* UNIV_DEBUG */
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+/** Dump the LRU list to stderr. */
+void buf_LRU_print();
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
+
+/** @name Heuristics for detecting index scan @{ */
+/** The denominator of buf_pool.LRU_old_ratio. */
+#define BUF_LRU_OLD_RATIO_DIV	1024
+/** Maximum value of buf_pool.LRU_old_ratio.
+@see buf_LRU_old_adjust_len
+@see buf_LRU_old_ratio_update */
+#define BUF_LRU_OLD_RATIO_MAX	BUF_LRU_OLD_RATIO_DIV
+/** Minimum value of buf_pool.LRU_old_ratio.
+@see buf_LRU_old_adjust_len
+@see buf_LRU_old_ratio_update
+The minimum must exceed
+(BUF_LRU_OLD_TOLERANCE + 5) * BUF_LRU_OLD_RATIO_DIV / BUF_LRU_OLD_MIN_LEN. */
+#define BUF_LRU_OLD_RATIO_MIN	51
+
+#if BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX
+# error "BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX"
+#endif
+#if BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV
+# error "BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV"
+#endif
+
+/** Move blocks to "new" LRU list only if the first access was at
+least this many milliseconds ago.  Not protected by any mutex or latch. */
+extern uint	buf_LRU_old_threshold_ms;
+/* @} */
+
+/** @brief Statistics for selecting the LRU list for eviction.
+
+These statistics are not 'of' LRU but 'for' LRU.  We keep count of I/O
+and page_zip_decompress() operations.  Based on the statistics we decide
+if we want to evict from buf_pool.unzip_LRU or buf_pool.LRU. */
+struct buf_LRU_stat_t
+{
+	ulint	io;	/**< Counter of buffer pool I/O operations. */
+	ulint	unzip;	/**< Counter of page_zip_decompress operations. */
+};
+
+/** Current operation counters.  Not protected by any mutex.
+Cleared by buf_LRU_stat_update(). */
+extern buf_LRU_stat_t	buf_LRU_stat_cur;
+
+/** Running sum of past values of buf_LRU_stat_cur.
+Updated by buf_LRU_stat_update().  Protected by buf_pool.mutex. */
+extern buf_LRU_stat_t	buf_LRU_stat_sum;
+
+/********************************************************************//**
+Increments the I/O counter in buf_LRU_stat_cur (not mutex-protected). */
+#define buf_LRU_stat_inc_io() (buf_LRU_stat_cur.io++)
+/********************************************************************//**
+Increments the page_zip_decompress() counter in buf_LRU_stat_cur. */
+#define buf_LRU_stat_inc_unzip() (buf_LRU_stat_cur.unzip++)
diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h
new file mode 100644
index 00000000..3dd085dd
--- /dev/null
+++ b/storage/innobase/include/buf0rea.h
@@ -0,0 +1,120 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0rea.h
+The database buffer read
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0rea_h
+#define buf0rea_h
+
+#include "buf0buf.h"
+
+/** High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@param page_id   page id
+@param zip_size  ROW_FORMAT=COMPRESSED page size, or 0
+@retval DB_SUCCESS if the page was read and is not corrupted
+@retval DB_SUCCESS_LOCKED_REC if the page was not read
+@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted
+@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
+after decryption normal page checksum does not match.
+@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
+dberr_t buf_read_page(const page_id_t page_id, ulint zip_size);
+
+/** High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@param[in,out]	space		tablespace
+@param[in]	page_id		page id
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0 */
+void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
+                              ulint zip_size)
+  MY_ATTRIBUTE((nonnull));
+
+/** Applies a random read-ahead in buf_pool if there are at least a threshold
+value of accessed pages from the random read-ahead area. Does not read any
+page, not even the one at the position (space, offset), if the read-ahead
+mechanism is not activated. NOTE 1: the calling thread may own latches on
+pages: to avoid deadlocks this function must be written such that it cannot
+end up waiting for these latches! NOTE 2: the calling thread must want
+access to the page given: this rule is set to prevent unintended read-aheads
+performed by ibuf routines, a situation which could result in a deadlock if
+the OS does not support asynchronous i/o.
+@param[in]	page_id		page id of a page which the current thread
+wants to access
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	ibuf		whether we are inside ibuf routine
+@return number of page read requests issued; NOTE that if we read ibuf
+pages, it may happen that the page at the given page number does not
+get read even if we return a positive value! */
+ulint
+buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf);
+
+/** Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+bufferfixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens, if these are not initialized to any
+sensible value? No problem, before applying read-ahead we check that the
+area to read is within the span of the space, if not, read-ahead is not
+applied. An uninitialized value may result in a useless read operation, but
+only very improbably.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous io.
+@param[in]	page_id		page id; see NOTE 3 above
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	ibuf		whether we are inside ibuf routine
+@return number of page read requests issued */
+ulint
+buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf);
+
+/** Schedule a page for recovery.
+@param space    tablespace
+@param page_id  page identifier
+@param recs     log records
+@param init     page initialization, or nullptr if the page needs to be read */
+void buf_read_recover(fil_space_t *space, const page_id_t page_id,
+                      page_recv_t &recs, recv_init *init);
+
+/** @name Modes used in read-ahead @{ */
+/** read only pages belonging to the insert buffer tree */
+#define BUF_READ_IBUF_PAGES_ONLY	131
+/** read any page */
+#define BUF_READ_ANY_PAGE		132
+/* @} */
+
+#endif
diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h
new file mode 100644
index 00000000..6c13f5ee
--- /dev/null
+++ b/storage/innobase/include/buf0types.h
@@ -0,0 +1,235 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2019, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0types.h
+The database buffer pool global types for the directory
+
+Created 11/17/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "univ.i"
+
+/** Buffer page (uncompressed or compressed) */
+class buf_page_t;
+/** Buffer block for which an uncompressed page exists */
+struct buf_block_t;
+/** Buffer pool statistics struct */
+struct buf_pool_stat_t;
+/** Buffer pool buddy statistics struct */
+struct buf_buddy_stat_t;
+
+/** A buffer frame. @see page_t */
+typedef	byte	buf_frame_t;
+
+/** Alternatives for srv_checksum_algorithm, which can be changed by
+setting innodb_checksum_algorithm */
+enum srv_checksum_algorithm_t {
+  /** Write crc32; allow full_crc32,crc32,innodb,none when reading */
+  SRV_CHECKSUM_ALGORITHM_CRC32,
+  /** Write crc32; allow full_crc32,crc32 when reading */
+  SRV_CHECKSUM_ALGORITHM_STRICT_CRC32,
+  /** For new files, always compute CRC-32C for the whole page.
+  For old files, allow crc32, innodb or none when reading. */
+  SRV_CHECKSUM_ALGORITHM_FULL_CRC32,
+  /** For new files, always compute CRC-32C for the whole page.
+  For old files, allow crc32 when reading. */
+  SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32
+};
+
+/** @param algo  checksum algorithm setting
+@return whether algo is exactly SRV_CHECKSUM_ALGORITHM_STRICT_CRC32 */
+inline bool is_checksum_strict(srv_checksum_algorithm_t algo)
+{ return SRV_CHECKSUM_ALGORITHM_STRICT_CRC32 == algo; }
+
+/** Overload taking the algorithm as a plain ulint.
+@return whether algo is exactly SRV_CHECKSUM_ALGORITHM_STRICT_CRC32 */
+inline bool is_checksum_strict(ulint algo)
+{ return SRV_CHECKSUM_ALGORITHM_STRICT_CRC32 == algo; }
+
+/** Parameters of binary buddy system for compressed pages (buf0buddy.h) */
+/* @{ */
+/** Zip shift value for the smallest page size */
+#define BUF_BUDDY_LOW_SHIFT	UNIV_ZIP_SIZE_SHIFT_MIN
+
+/** Smallest buddy page size */
+#define BUF_BUDDY_LOW		(1U << BUF_BUDDY_LOW_SHIFT)
+
+/** Actual number of buddy sizes based on current page size */
+#define BUF_BUDDY_SIZES		(srv_page_size_shift - BUF_BUDDY_LOW_SHIFT)
+
+/** Maximum number of buddy sizes based on the max page size */
+#define BUF_BUDDY_SIZES_MAX	(UNIV_PAGE_SIZE_SHIFT_MAX	\
+				- BUF_BUDDY_LOW_SHIFT)
+
+/** twice the maximum block size of the buddy system;
+the underlying memory is aligned by this amount:
+this must be equal to srv_page_size */
+#define BUF_BUDDY_HIGH	(BUF_BUDDY_LOW << BUF_BUDDY_SIZES)
+/* @} */
+
+/** Page identifier: a tablespace id and a page number packed in 64 bits. */
+class page_id_t
+{
+public:
+  /** Construct from (space, page_no); space occupies the upper 32 bits.
+  @param space	 tablespace id
+  @param page_no page number */
+  constexpr page_id_t(uint32_t space, uint32_t page_no) :
+    m_id((uint64_t{space} << 32) | uint64_t{page_no}) {}
+
+  constexpr page_id_t(uint64_t id) : m_id(id) {}
+  /** @return whether both identifiers hold the same packed value */
+  constexpr bool operator==(const page_id_t &o) const { return m_id == o.m_id; }
+  /** @return whether the packed values differ */
+  constexpr bool operator!=(const page_id_t &o) const { return m_id != o.m_id; }
+  /** @return whether this packed value is smaller than that of o */
+  constexpr bool operator<(const page_id_t &o) const { return m_id < o.m_id; }
+  /** @return whether this packed value is greater than that of o */
+  constexpr bool operator>(const page_id_t &o) const { return m_id > o.m_id; }
+  /** @return whether this packed value is not greater than that of o */
+  constexpr bool operator<=(const page_id_t &o) const { return m_id <= o.m_id; }
+  /** @return whether this packed value is not smaller than that of o */
+  constexpr bool operator>=(const page_id_t &o) const { return m_id >= o.m_id; }
+  page_id_t &operator--() { ut_ad(page_no()); --m_id; return *this; }
+  page_id_t &operator++()
+  {
+    ut_ad(page_no() < 0xFFFFFFFFU);
+    ++m_id;
+    return *this;
+  }
+  page_id_t operator-(uint32_t i) const
+  {
+    ut_ad(page_no() >= i);
+    return page_id_t{m_id - i};
+  }
+  page_id_t operator+(uint32_t i) const
+  {
+    ut_ad(page_no() < ~i);
+    return page_id_t{m_id + i};
+  }
+
+  /** Extract the upper 32 bits.
+  @return tablespace id */
+  constexpr uint32_t space() const { return uint32_t(m_id >> 32); }
+
+  /** Extract the lower 32 bits.
+  @return page number */
+  constexpr uint32_t page_no() const { return uint32_t(m_id & 0xFFFFFFFFU); }
+
+  /** Combine space() and page_no() into one value.
+  @return fold value */
+  constexpr ulint fold() const
+  { return (ulint{space()} << 20) + space() + page_no(); }
+
+  /** Replace the page number; the tablespace id is kept.
+  @param[in]	page_no	page number */
+  void set_page_no(uint32_t page_no)
+  {
+    m_id= (m_id & 0xFFFFFFFF00000000ULL) | page_no;
+  }
+
+  constexpr ulonglong raw() const { return m_id; }
+
+  /** Flag the page identifier as corrupted (all bits set). */
+  void set_corrupted() { m_id= ~uint64_t{0}; }
+
+  /** @return whether the page identifier belongs to a corrupted page */
+  constexpr bool is_corrupted() const { return ~m_id == 0; }
+
+private:
+  /** The packed identifier: space() << 32 | page_no() */
+  uint64_t m_id;
+};
+
+/** A 64KiB buffer of NUL bytes, for use in assertions and checks,
+and dummy default values of instantly dropped columns.
+Initially, BLOB field references are set to NUL bytes, in
+dtuple_convert_big_rec(). */
+extern const byte *field_ref_zero;
+
+#ifndef UNIV_INNOCHECKSUM
+
+/** Latch types */
+enum rw_lock_type_t
+{
+  RW_S_LATCH= 1 << 0,
+  RW_X_LATCH= 1 << 1,
+  RW_SX_LATCH= 1 << 2,
+  RW_NO_LATCH= 1 << 3
+};
+
+#include "sux_lock.h"
+
+#ifdef SUX_LOCK_GENERIC
+class page_hash_latch : private rw_lock
+{
+  /** Wait for a shared lock (private; only declared here) */
+  void read_lock_wait();
+  /** Wait for an exclusive lock (private; only declared here) */
+  void write_lock_wait();
+public:
+  /** Acquire a shared lock (declared inline; the body is not in this class) */
+  inline void lock_shared();
+  /** Acquire an exclusive lock (declared inline; the body is not in this class) */
+  inline void lock();
+
+  /** @return whether an exclusive lock is being held by any thread */
+  bool is_write_locked() const { return rw_lock::is_write_locked(); }
+
+  /** @return whether any lock is being held by any thread */
+  bool is_locked() const { return rw_lock::is_locked(); }
+  /** @return whether any lock is being held or waited for by any thread */
+  bool is_locked_or_waiting() const { return rw_lock::is_locked_or_waiting(); }
+
+  /** Release a shared lock; forwards to read_unlock() */
+  void unlock_shared() { read_unlock(); }
+  /** Release an exclusive lock; forwards to write_unlock() */
+  void unlock() { write_unlock(); }
+};
+#elif defined _WIN32 || SIZEOF_SIZE_T >= 8
+class page_hash_latch
+{
+  srw_spin_lock_low latch; /* every member below forwards to this object */
+public:
+  bool is_locked() const { return latch.is_locked(); }
+  bool is_write_locked() const { return latch.is_write_locked(); }
+  bool is_locked_or_waiting() const { return latch.is_locked_or_waiting(); }
+  void lock_shared() { latch.rd_lock(); }
+  void lock() { latch.wr_lock(); }
+  void unlock_shared() { latch.rd_unlock(); }
+  void unlock() { latch.wr_unlock(); }
+};
+#else
+class page_hash_latch
+{
+  srw_spin_mutex mtx; /* sole lock; shared requests take it exclusively */
+public:
+  void lock() { mtx.wr_lock(); }
+  void unlock() { mtx.wr_unlock(); }
+  void lock_shared() { mtx.wr_lock(); }
+  void unlock_shared() { mtx.wr_unlock(); }
+  bool is_locked() const { return mtx.is_locked(); }
+  bool is_write_locked() const { return mtx.is_locked(); }
+  bool is_locked_or_waiting() const { return mtx.is_locked(); }
+};
+#endif
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h
new file mode 100644
index 00000000..a5356e0d
--- /dev/null
+++ b/storage/innobase/include/data0data.h
@@ -0,0 +1,704 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0data.h
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef data0data_h
+#define data0data_h
+
+#include "data0types.h"
+#include "data0type.h"
+#include "mem0mem.h"
+#include "dict0types.h"
+#include "btr0types.h"
+#include <vector>
+
+#include <ostream>
+
+/** Storage for overflow data in a big record, that is, a clustered
+index record which needs external storage of data fields */
+struct big_rec_t;
+struct upd_t;
+
+/** Dummy variable to catch access to uninitialized fields.  In the
+debug version, dtuple_create() will make all fields of dtuple_t point
+to data_error. */
+ut_d(extern byte data_error);
+
+/*********************************************************************//**
+Sets the type struct of SQL data field. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+	dfield_t*	field,	/*!< in: SQL data field */
+	const dtype_t*	type);	/*!< in: pointer to data type struct */
+
+/*********************************************************************//**
+Sets length in a field. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+	dfield_t*	field,	/*!< in: field */
+	ulint		len)	/*!< in: length or UNIV_SQL_NULL */
+	MY_ATTRIBUTE((nonnull));
+
+/** Gets spatial status for "external storage"
+@param[in]	field		field
+UNIV_INLINE
+spatial_status_t
+dfield_get_spatial_status(
+	const dfield_t*	field);
+
+/** Sets spatial status for "external storage"
+@param[in,out]	field		field
+@param[in]	spatial_status	spatial status */
+UNIV_INLINE
+void
+dfield_set_spatial_status(
+	dfield_t*		field,
+	spatial_status_t	spatial_status);
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+	dfield_t*	field,	/*!< in: field */
+	const void*	data,	/*!< in: data */
+	ulint		len)	/*!< in: length or UNIV_SQL_NULL */
+	MY_ATTRIBUTE((nonnull(1)));
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_write_mbr(
+/*=============*/
+	dfield_t*	field,	/*!< in: field */
+	const double*	mbr)	/*!< in: data */
+	MY_ATTRIBUTE((nonnull(1)));
+/*********************************************************************//**
+Sets a data field to SQL NULL. */
+UNIV_INLINE
+void
+dfield_set_null(
+/*============*/
+	dfield_t*	field)	/*!< in/out: field */
+	MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Writes an SQL null field full of zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+	byte*	data,	/*!< in: pointer to a buffer of size len */
+	ulint	len)	/*!< in: SQL null size in bytes */
+	MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Copies the data and len fields. */
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+	dfield_t*	field1,		/*!< out: field to copy to */
+	const dfield_t*	field2);	/*!< in: field to copy from */
+
+/*********************************************************************//**
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+	dfield_t*	field1,	/*!< out: field to copy to */
+	const dfield_t*	field2)	/*!< in: field to copy from */
+	MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Copies the data pointed to by a data field. */
+UNIV_INLINE
+void
+dfield_dup(
+/*=======*/
+	dfield_t*	field,	/*!< in/out: data field */
+	mem_heap_t*	heap)	/*!< in: memory heap where allocated */
+	MY_ATTRIBUTE((nonnull));
+
+/*********************************************************************//**
+Tests if two data fields are equal.
+If len==0, tests the data length and content for equality.
+If len>0, tests the first len bytes of the content for equality.
+@return TRUE if both fields are NULL or if they are equal */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+/*==========================*/
+	const dfield_t*	field1,	/*!< in: field */
+	const dfield_t*	field2,	/*!< in: field */
+	ulint		len)	/*!< in: maximum prefix to compare,
+				or 0 to compare the whole field length */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Tests if dfield data length and content is equal to the given.
+@return TRUE if equal */
+UNIV_INLINE
+ibool
+dfield_data_is_binary_equal(
+/*========================*/
+	const dfield_t*	field,	/*!< in: field */
+	ulint		len,	/*!< in: data length or UNIV_SQL_NULL */
+	const byte*	data)	/*!< in: data */
+	MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/*********************************************************************//**
+Gets info bits in a data tuple.
+@return info bits */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(
+/*=================*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Sets info bits in a data tuple. */
+UNIV_INLINE
+void
+dtuple_set_info_bits(
+/*=================*/
+	dtuple_t*	tuple,		/*!< in: tuple */
+	ulint		info_bits)	/*!< in: info bits */
+	MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Gets number of fields used in record comparisons.
+@return number of fields used in comparisons in rem0cmp.* */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Sets number of fields used in record comparisons. */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(
+/*====================*/
+	dtuple_t*	tuple,		/*!< in: tuple */
+	ulint		n_fields_cmp)	/*!< in: number of fields used in
+					comparisons in rem0cmp.* */
+	MY_ATTRIBUTE((nonnull));
+
+/* Estimate the number of bytes that are going to be allocated when
+creating a new dtuple_t object */
+#define DTUPLE_EST_ALLOC(n_fields)	\
+	(sizeof(dtuple_t) + (n_fields) * sizeof(dfield_t))
+
+/** Creates a data tuple from an already allocated chunk of memory.
+The size of the chunk must be at least DTUPLE_EST_ALLOC(n_fields).
+The default value for number of fields used in record comparisons
+for this tuple is n_fields.
+@param[in,out]	buf		buffer to use
+@param[in]	buf_size	buffer size
+@param[in]	n_fields	number of field
+@param[in]	n_v_fields	number of fields on virtual columns
+@return created tuple (inside buf) */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_from_mem(
+	void*	buf,
+	ulint	buf_size,
+	ulint	n_fields,
+	ulint	n_v_fields)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**********************************************************//**
+Creates a data tuple to a memory heap. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+	mem_heap_t*	heap,	/*!< in: memory heap where the tuple
+				is created, DTUPLE_EST_ALLOC(n_fields)
+				bytes will be allocated from this heap */
+	ulint		n_fields)/*!< in: number of fields */
+	MY_ATTRIBUTE((nonnull, malloc));
+
+/** Initialize the virtual field data in a dtuple_t
+@param[in,out]		vrow	dtuple contains the virtual fields */
+UNIV_INLINE void dtuple_init_v_fld(dtuple_t* vrow);
+
+/** Duplicate the virtual field data in a dtuple_t
+@param[in,out]		vrow	dtuple contains the virtual fields
+@param[in]		heap	heap memory to use */
+UNIV_INLINE void dtuple_dup_v_fld(dtuple_t* vrow, mem_heap_t* heap);
+
+/** Creates a data tuple with possible virtual columns to a memory heap.
+@param[in]	heap		memory heap where the tuple is created
+@param[in]	n_fields	number of fields
+@param[in]	n_v_fields	number of fields on virtual col
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_with_vcol(
+	mem_heap_t*	heap,
+	ulint		n_fields,
+	ulint		n_v_fields);
+
+/*********************************************************************//**
+Sets number of fields used in a tuple. Normally this is set in
+dtuple_create, but if you want later to set it smaller, you can use this. */
+void
+dtuple_set_n_fields(
+/*================*/
+	dtuple_t*	tuple,		/*!< in: tuple */
+	ulint		n_fields)	/*!< in: number of fields */
+	MY_ATTRIBUTE((nonnull));
+/** Copies a data tuple's virtual fields to another. This is a shallow copy.
+@param[in,out]	d_tuple		destination tuple
+@param[in]	s_tuple		source tuple */
+UNIV_INLINE
+void
+dtuple_copy_v_fields(
+	dtuple_t*	d_tuple,
+	const dtuple_t*	s_tuple);
+/*********************************************************************//**
+Copies a data tuple to another.  This is a shallow copy; if a deep copy
+is desired, dfield_dup() will have to be invoked on each field.
+@return own: copy of tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_copy(
+/*========*/
+	const dtuple_t*	tuple,	/*!< in: tuple to copy from */
+	mem_heap_t*	heap)	/*!< in: memory heap
+				where the tuple is created */
+	MY_ATTRIBUTE((nonnull, malloc));
+/**********************************************************//**
+The following function returns the sum of data lengths of a tuple. The space
+occupied by the field structs or the tuple struct is not counted.
+@return sum of data lens */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+/*=================*/
+	const dtuple_t*	tuple,	/*!< in: typed data tuple */
+	ulint		comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+	MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Computes the number of externally stored fields in a data tuple.
+@return number of fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_ext(
+/*=============*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	MY_ATTRIBUTE((nonnull));
+/** Fold a prefix given as the number of fields of a tuple.
+@param[in]	tuple		index record
+@param[in]	n_fields	number of complete fields to fold
+@param[in]	n_bytes		number of bytes to fold in the last field
+@param[in]	tree_id		index tree ID
+@return the folded value */
+UNIV_INLINE
+ulint
+dtuple_fold(
+	const dtuple_t*	tuple,
+	ulint		n_fields,
+	ulint		n_bytes,
+	index_id_t	tree_id)
+	MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************************//**
+Sets types of fields binary in a tuple. */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+/*====================*/
+	dtuple_t*	tuple,	/*!< in: data tuple */
+	ulint		n)	/*!< in: number of fields to set */
+	MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Checks if a dtuple contains an SQL null value.
+@return TRUE if some field is SQL null */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+/*=================*/
+	const dtuple_t*	tuple)	/*!< in: dtuple */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/**********************************************************//**
+Checks that a data field is typed. Asserts an error if not.
+@return TRUE if ok */
+ibool
+dfield_check_typed(
+/*===============*/
+	const dfield_t*	field)	/*!< in: data field */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/**********************************************************//**
+Checks that a data tuple is typed. Asserts an error if not.
+@return TRUE if ok */
+ibool
+dtuple_check_typed(
+/*===============*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Validates the consistency of a tuple which must be complete, i.e,
+all fields must have been set.
+@return TRUE if ok */
+ibool
+dtuple_validate(
+/*============*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. */
+void
+dfield_print(
+/*=========*/
+	const dfield_t*	dfield)	/*!< in: dfield */
+	MY_ATTRIBUTE((nonnull));
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. Also the hex string
+is printed if a string contains non-printable characters. */
+void
+dfield_print_also_hex(
+/*==================*/
+	const dfield_t*	dfield)	 /*!< in: dfield */
+	MY_ATTRIBUTE((nonnull));
+/**********************************************************//**
+The following function prints the contents of a tuple. */
+void
+dtuple_print(
+/*=========*/
+	FILE*		f,	/*!< in: output stream */
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	MY_ATTRIBUTE((nonnull));
+
+/** Print the contents of an array of data fields.
+@param[out]	o	output stream
+@param[in]	field	array of data fields
+@param[in]	n	number of data fields */
+void
+dfield_print(
+	std::ostream&	o,
+	const dfield_t*	field,
+	ulint		n);
+/** Print the contents of a tuple.
+@param[out]	o	output stream
+@param[in]	tuple	data tuple */
+void
+dtuple_print(
+	std::ostream&	o,
+	const dtuple_t*	tuple);
+
+/** Stream a data tuple by forwarding to dtuple_print().
+@param[in,out]	o	output stream
+@param[in]	tuple	data tuple
+@return o, so that insertions can be chained */
+inline std::ostream& operator<<(std::ostream& o, const dtuple_t& tuple)
+{
+	const dtuple_t* const source = &tuple;
+	dtuple_print(o, source);
+	return o;
+}
+
+/**************************************************************//**
+Moves parts of long fields in entry to the big record vector so that
+the size of tuple drops below the maximum record size allowed in the
+database. Moves data only from those fields which are not necessary
+to determine uniquely the insertion place of the tuple in the index.
+@return own: created big record vector, NULL if we are not able to
+shorten the entry enough, i.e., if there are too many fixed-length or
+short fields in entry or the index is clustered */
+big_rec_t*
+dtuple_convert_big_rec(
+/*===================*/
+	dict_index_t*	index,	/*!< in: index */
+	upd_t*		upd,	/*!< in/out: update vector */
+	dtuple_t*	entry,	/*!< in/out: index entry */
+	ulint*		n_ext)	/*!< in/out: number of
+				externally stored columns */
+	MY_ATTRIBUTE((malloc, warn_unused_result));
+/**************************************************************//**
+Puts back to entry the data stored in vector. Note that to ensure the
+fields in entry can accommodate the data, vector must have been created
+from entry with dtuple_convert_big_rec. */
+void
+dtuple_convert_back_big_rec(
+/*========================*/
+	dict_index_t*	index,	/*!< in: index */
+	dtuple_t*	entry,	/*!< in: entry whose data was put to vector */
+	big_rec_t*	vector)	/*!< in, own: big rec vector; it is
+				freed in this function */
+	MY_ATTRIBUTE((nonnull));
+/**************************************************************//**
+Frees the memory in a big rec vector. */
+UNIV_INLINE
+void
+dtuple_big_rec_free(
+/*================*/
+	big_rec_t*	vector)	/*!< in, own: big rec vector; it is
+				freed in this function */
+	MY_ATTRIBUTE((nonnull));
+
+/*######################################################################*/
+
+/** Structure for an SQL data field */
+struct dfield_t{
+	void*		data;	/*!< pointer to data */
+	unsigned	ext:1;	/*!< TRUE=externally stored, FALSE=local */
+	unsigned	spatial_status:2;
+				/*!< spatial status of externally stored field
+				in undo log for purge */
+	unsigned	len;	/*!< data length; UNIV_SQL_NULL if SQL null */
+	dtype_t		type;	/*!< type of data */
+
+	/** Create a deep copy of this object.
+	@param[in,out]	heap	memory heap in which the clone will be created
+	@return	the cloned object */
+	dfield_t* clone(mem_heap_t* heap) const;
+
+	/** Check a system field; type.vers_sys_end() must hold.
+	@return whether data differs from the maximum value
+	(timestamp_max_bytes or trx_id_max_bytes), i.e. a history row */
+	bool vers_history_row() const
+	{
+		ut_ad(type.vers_sys_end());
+		if (type.mtype == DATA_FIXBINARY) {
+			ut_ad(len == sizeof timestamp_max_bytes);
+			return 0 != memcmp(data, timestamp_max_bytes, len);
+		}
+		/* Otherwise the field is asserted to be a DATA_INT. */
+		ut_ad(type.mtype == DATA_INT);
+		ut_ad(len == sizeof trx_id_max_bytes);
+		return 0 != memcmp(data, trx_id_max_bytes, len);
+	}
+};
+
+/** Structure for an SQL data tuple of fields (logical record) */
+struct dtuple_t {
+	ulint		info_bits;	/*!< info bits of an index record:
+					the default is 0; this field is used
+					if an index record is built from
+					a data tuple */
+	ulint		n_fields;	/*!< number of fields in dtuple */
+	ulint		n_fields_cmp;	/*!< number of fields which should
+					be used in comparison services
+					of rem0cmp.*; the index search
+					is performed by comparing only these
+					fields, others are ignored; the
+					default value in dtuple creation is
+					the same value as n_fields */
+	dfield_t*	fields;		/*!< fields */
+	ulint		n_v_fields;	/*!< number of virtual fields */
+	dfield_t*	v_fields;	/*!< fields on virtual column */
+#ifdef UNIV_DEBUG
+	ulint		magic_n;	/*!< magic number, used in
+					debug assertions */
+/** Value of dtuple_t::magic_n */
+# define		DATA_TUPLE_MAGIC_N	65478679
+#endif /* UNIV_DEBUG */
+
+	/** Trim the tail of an index tuple before insert or update.
+	After instant ADD COLUMN, if the last fields of a clustered index tuple
+	match the default values that were explicitly specified or implied
+	during ADD COLUMN, there will be no need to store them.
+	NOTE: A page latch in the index must be held, so that the index
+	may not lose 'instantness' before the trimmed tuple has been
+	inserted or updated.
+	@param[in]	index	index possibly with instantly added columns */
+	void trim(const dict_index_t& index);
+
+	bool vers_history_row() const /* checks the first vers_sys_end() field */
+	{
+		for (ulint i = 0; i < n_fields; i++) {
+			const dfield_t* field = &fields[i];
+			if (field->type.vers_sys_end()) {
+				return field->vers_history_row();
+			}
+		}
+		return false; /* no field has type.vers_sys_end() */
+	}
+
+	/**
+	@param info_bits	the info_bits of a data tuple
+	@return whether this is a hidden metadata record
+	for instant ALTER TABLE (not only ADD COLUMN) */
+	static bool is_alter_metadata(ulint info_bits)
+	{
+		return UNIV_UNLIKELY(info_bits == REC_INFO_METADATA_ALTER);
+	}
+
+	/**
+	@param info_bits	the info_bits of a data tuple
+	@return whether this is a hidden metadata record
+	for instant ADD COLUMN or ALTER TABLE */
+	static bool is_metadata(ulint info_bits)
+	{
+		return UNIV_UNLIKELY((info_bits & ~REC_INFO_DELETED_FLAG)
+				     == REC_INFO_METADATA_ADD);
+	}
+
+	/** @return whether this is a hidden metadata record
+	for instant ALTER TABLE (not only ADD COLUMN) */
+	bool is_alter_metadata() const { return is_alter_metadata(info_bits); }
+
+	/** @return whether this is a hidden metadata record
+	for instant ADD COLUMN or ALTER TABLE */
+	bool is_metadata() const { return is_metadata(info_bits); }
+
+	/** Copy type information from index fields.
+	@param index	index whose field types are to be copied */
+	inline void copy_field_types(const dict_index_t &index);
+};
+
+/** @return the number of fields in a data tuple */ inline ulint
+dtuple_get_n_fields(const dtuple_t* tuple) { return tuple->n_fields; }
+inline dtype_t* dfield_get_type(dfield_t* field) { return &field->type; }
+inline const dtype_t* dfield_get_type(const dfield_t* field) /* read-only */
+{ return &field->type; } /* both overloads yield the address of field->type */
+inline void* dfield_get_data(dfield_t* field) {
+	/* Debug check: a non-NULL field must not point to data_error. */
+	ut_ad(!(field->len != UNIV_SQL_NULL && field->data == &data_error));
+	return field->data;
+}
+inline const void* dfield_get_data(const dfield_t* field) {
+	/* Debug check: a non-NULL field must not point to data_error. */
+	ut_ad(!(field->len != UNIV_SQL_NULL && field->data == &data_error));
+	return field->data;
+}
+inline ulint dfield_get_len(const dfield_t* field) { /* not UNIV_SQL_DEFAULT */
+	ut_ad(!(field->len != UNIV_SQL_NULL && field->data == &data_error));
+	ut_ad(!(field->len == UNIV_SQL_DEFAULT));
+	return field->len;
+}
+/** @return whether a field is SQL NULL */ inline bool
+dfield_is_null(const dfield_t* field) { return UNIV_SQL_NULL == field->len; }
+/** Check the "external storage" flag of a field.
+@return whether the column is to be stored off-page */
+inline bool dfield_is_ext(const dfield_t* field) {
+	ut_ad(!(field->ext && field->len < BTR_EXTERN_FIELD_REF_SIZE));
+	return field->ext != 0;
+}
+/** Mark a field as externally stored by setting its ext flag. */
+inline void dfield_set_ext(dfield_t* field) { field->ext = true; }
+
+/** Get the number of virtual fields in a data tuple.
+@param[in]	tuple	data tuple
+@return number of virtual fields */
+inline ulint dtuple_get_n_v_fields(const dtuple_t* tuple)
+{ return tuple->n_v_fields; }
+
+inline const dfield_t* dtuple_get_nth_field(const dtuple_t* tuple, ulint n)
+{	/* n must be a valid field index; this is a debug assertion only */
+	ut_ad(tuple->n_fields > n);
+	return tuple->fields + n;
+}
+inline dfield_t* dtuple_get_nth_field(dtuple_t* tuple, ulint n)
+{	/* n must be a valid field index; this is a debug assertion only */
+	ut_ad(tuple->n_fields > n);
+	return tuple->fields + n;
+}
+
+/** Get a virtual column in a table row or an extended clustered index record.
+@param[in]	tuple	tuple
+@param[in]	n	index of the virtual field; must be < tuple->n_v_fields
+@return nth virtual field */
+inline const dfield_t* dtuple_get_nth_v_field(const dtuple_t* tuple, ulint n)
+{
+	ut_ad(tuple->n_v_fields > n);
+	return tuple->v_fields + n;
+}
+/** Get a virtual column in a table row or an extended clustered index record.
+@param[in]	tuple	tuple
+@param[in]	n	index of the virtual field; must be < tuple->n_v_fields
+@return nth virtual field */
+inline dfield_t* dtuple_get_nth_v_field(dtuple_t* tuple, ulint n)
+{
+	ut_ad(tuple->n_v_fields > n);
+	return tuple->v_fields + n;
+}
+
+/** A slot for a field in a big rec vector */
+struct big_rec_field_t {
+	ulint		field_no;	/*!< field number in record */
+	ulint		len;		/*!< stored data length, in bytes */
+	const void*	data;		/*!< stored data */
+
+	/** Construct a slot from the values of its three members.
+	@param[in]	field_no_	the field number
+	@param[in]	len_		the data length
+	@param[in]	data_		the data; only the pointer is stored */
+	big_rec_field_t(ulint field_no_, ulint len_, const void* data_)
+		: field_no(field_no_),
+		  len(len_),
+		  data(data_)
+	{
+	}
+};
+
+/** Storage format for overflow data in a big record, that is, a
+clustered index record which needs external storage of data fields */
+struct big_rec_t {
+	mem_heap_t*	heap;		/*!< memory heap from which
+					allocated */
+	const ulint	capacity;	/*!< fields array size */
+	ulint		n_fields;	/*!< number of stored fields */
+	big_rec_field_t*fields;		/*!< stored fields */
+
+	/** Create an empty descriptor; no heap or array is attached yet.
+	@param[in]	max	the capacity of the array of fields. */
+	explicit big_rec_t(const ulint max)
+		: heap(nullptr), capacity(max),
+		  n_fields(0), fields(nullptr)
+	{
+	}
+
+	/** Append one big_rec_field_t object to the end of array of fields.
+	The caller must ensure n_fields < capacity (a debug assertion only).
+	@param[in]	field	slot that is copied into the array */
+	void append(const big_rec_field_t& field)
+	{
+		ut_ad(capacity > n_fields);
+		fields[n_fields++] = field;
+	}
+
+	/** Allocate a big_rec_t object in the given memory heap, and for
+	storing n_fld number of fields.
+	@param[in]	heap	memory heap in which this object is allocated
+	@param[in]	n_fld	maximum number of fields that can be stored in
+			this object
+	@return the allocated object */
+	static big_rec_t* alloc(
+		mem_heap_t*	heap,
+		ulint		n_fld);
+};
+
+#include "data0data.inl"
+
+#endif
diff --git a/storage/innobase/include/data0data.inl b/storage/innobase/include/data0data.inl
new file mode 100644
index 00000000..2d1bf5a2
--- /dev/null
+++ b/storage/innobase/include/data0data.inl
@@ -0,0 +1,633 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0data.inl
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0rnd.h"
+
+/** Set the data type of an SQL data field by copying a type struct.
+@param[out]	field	SQL data field whose type is overwritten
+@param[in]	type	data type struct to copy from */
+UNIV_INLINE
+void
+dfield_set_type(
+	dfield_t*	field,
+	const dtype_t*	type)
+{
+	ut_ad(field);
+	ut_ad(type);
+
+	*dfield_get_type(field) = *type;
+}
+
+/** Set the length of a field and clear its "external storage" flag.
+@param[in,out]	field	field
+@param[in]	len	length or UNIV_SQL_NULL; must not be UNIV_SQL_DEFAULT */
+UNIV_INLINE
+void
+dfield_set_len(
+	dfield_t*	field,
+	ulint		len)
+{
+	ut_ad(!(len == UNIV_SQL_DEFAULT));
+	field->len = static_cast<unsigned>(len);
+	field->ext = false;
+}
+
+/** Read the spatial status of an externally stored field.
+@param[in]	field	field; must have the "external storage" flag set
+@return the spatial status stored in the field */
+UNIV_INLINE
+spatial_status_t
+dfield_get_spatial_status(const dfield_t* field)
+{
+	ut_ad(dfield_is_ext(field));
+	const unsigned	raw_status = field->spatial_status;
+	return static_cast<spatial_status_t>(raw_status);
+}
+
+/** Store a spatial status in the 2-bit spatial_status member of a field.
+@param[in,out]	field		field
+@param[in]	spatial_status	spatial status; only its low 2 bits are kept */
+UNIV_INLINE
+void
+dfield_set_spatial_status(dfield_t* field, spatial_status_t spatial_status)
+{
+	/* The member is a 2-bit bit-field, hence the mask. */
+	field->spatial_status = 3 & spatial_status;
+	/* Debug check that the value can be read back unchanged. */
+	ut_ad(spatial_status == dfield_get_spatial_status(field));
+}
+
+/** Set the data pointer and length of a field, clearing the ext flag.
+The data is not copied; the field merely points to it.
+@param[out]	field	field
+@param[in]	data	data
+@param[in]	len	length or UNIV_SQL_NULL */
+UNIV_INLINE
+void
+dfield_set_data(dfield_t* field, const void* data, ulint len)
+{
+	field->len = static_cast<unsigned>(len);
+	field->ext = false;
+	/* dfield_t::data is a plain void*, so constness is dropped here. */
+	field->data = const_cast<void*>(data);
+}
+
+/** Write a minimum bounding rectangle into the data buffer of a field.
+The SPDIMS * 2 doubles are stored with mach_double_write() into the
+memory that field->data already points to; the ext flag is cleared and
+the length is set to DATA_MBR_LEN.
+@param[in,out]	field	field whose data buffer is written to
+@param[in]	mbr	array of SPDIMS * 2 coordinates */
+UNIV_INLINE
+void
+dfield_write_mbr(dfield_t* field, const double* mbr)
+{
+	/* All SPDIMS * 2 values are read below, not only the first one. */
+	MEM_CHECK_DEFINED(mbr, SPDIMS * 2 * sizeof *mbr);
+	field->ext = 0;
+	for (unsigned i = 0; i < SPDIMS * 2; i++) {
+		mach_double_write(static_cast<byte*>(field->data)
+				  + i * sizeof(double), mbr[i]);
+	}
+	field->len = DATA_MBR_LEN;
+}
+
+/** Set a data field to SQL NULL: no data, length UNIV_SQL_NULL,
+and the "external storage" flag cleared.
+@param[in,out]	field	field */
+UNIV_INLINE
+void
+dfield_set_null(dfield_t* field)
+{
+	const void*	no_data = NULL;
+	dfield_set_data(field, no_data, UNIV_SQL_NULL);
+}
+
+/** Copy everything except the type from one data field to another:
+the data pointer, length, ext flag and spatial status.
+The data itself is not duplicated.
+@param[out]	field1	field to copy to
+@param[in]	field2	field to copy from */
+UNIV_INLINE
+void
+dfield_copy_data(dfield_t* field1, const dfield_t* field2)
+{
+	ut_ad(field1);
+	ut_ad(field2);
+
+	field1->spatial_status = field2->spatial_status;
+	field1->ext = field2->ext;
+	field1->len = field2->len;
+	field1->data = field2->data;
+}
+
+/** Copy a whole data field (data pointer, length, flags and type)
+to another by struct assignment; the data itself is not duplicated.
+@param[out]	field1	field to copy to
+@param[in]	field2	field to copy from */
+UNIV_INLINE
+void
+dfield_copy(dfield_t* field1, const dfield_t* field2)
+{
+	const dfield_t&	source = *field2;
+	*field1 = source;
+}
+
+/** Replace the data pointer of a field with a copy of the data
+made by mem_heap_dup(). An SQL NULL field is left untouched.
+@param[in,out]	field	data field
+@param[in,out]	heap	memory heap to allocate the copy from */
+UNIV_INLINE
+void
+dfield_dup(dfield_t* field, mem_heap_t* heap)
+{
+	if (dfield_is_null(field)) {
+		return;
+	}
+	MEM_CHECK_DEFINED(field->data, field->len);
+	field->data = mem_heap_dup(heap, field->data, field->len);
+}
+
+/** Test whether two data fields are binary equal.
+If len==0, the whole data length and content are compared.
+If len>0, at most the first len bytes of each field are compared.
+@param[in]	field1	field
+@param[in]	field2	field
+@param[in]	len	maximum prefix to compare, or 0 for the whole length
+@return TRUE if both fields are NULL or if they are equal */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+	const dfield_t*	field1,
+	const dfield_t*	field2,
+	ulint		len)
+{
+	/* Effective length of each field: its own length if it is SQL NULL,
+	if no prefix was given, or if it is shorter than the prefix. */
+	const ulint	cmp_len1 = (field1->len == UNIV_SQL_NULL || len == 0
+				    || field1->len < len)
+		? ulint(field1->len) : len;
+	const ulint	cmp_len2 = (field2->len == UNIV_SQL_NULL || len == 0
+				    || field2->len < len)
+		? ulint(field2->len) : len;
+	if (cmp_len1 != cmp_len2) {
+		return FALSE;
+	}
+	return cmp_len1 == UNIV_SQL_NULL
+		|| !memcmp(field1->data, field2->data, cmp_len1);
+}
+
+/** Test whether the length and content of a field equal the given data.
+@param[in]	field	field
+@param[in]	len	data length or UNIV_SQL_NULL
+@param[in]	data	data; not read if len is 0 or UNIV_SQL_NULL
+@return TRUE if equal */
+UNIV_INLINE
+ibool
+dfield_data_is_binary_equal(
+	const dfield_t* field, ulint len, const byte* data)
+{
+	ut_ad(len != UNIV_SQL_DEFAULT);
+
+	const bool	same_len = len == dfield_get_len(field);
+	return same_len && (!len || len == UNIV_SQL_NULL
+			    || !memcmp(dfield_get_data(field), data, len));
+}
+
+/** Get the info bits of a data tuple.
+@param[in]	tuple	tuple
+@return info bits */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(const dtuple_t* tuple)
+{
+	const ulint	bits = tuple->info_bits;
+
+	return bits;
+}
+
+/** Set the info bits of a data tuple.
+@param[out]	tuple		tuple
+@param[in]	info_bits	info bits to store */
+UNIV_INLINE
+void
+dtuple_set_info_bits(dtuple_t* tuple, ulint info_bits)
+{
+	dtuple_t&	target = *tuple;
+
+	target.info_bits = info_bits;
+}
+
+/** Get the number of fields used in record comparisons.
+@param[in]	tuple	tuple
+@return number of fields used in comparisons in rem0cmp.* */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(const dtuple_t* tuple)
+{
+	const ulint	n_cmp = tuple->n_fields_cmp;
+
+	return n_cmp;
+}
+
+/** Set the number of fields used in record comparisons.
+@param[in,out]	tuple		tuple
+@param[in]	n_fields_cmp	number of fields used in comparisons in
+				rem0cmp.*; must not exceed tuple->n_fields */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(dtuple_t* tuple, ulint n_fields_cmp)
+{
+	/* A tuple cannot be compared on more fields than it has. */
+	ut_ad(!(n_fields_cmp > tuple->n_fields));
+
+	tuple->n_fields_cmp = n_fields_cmp;
+}
+
+/** Creates a data tuple from an already allocated chunk of memory.
+The size of the chunk must be at least
+DTUPLE_EST_ALLOC(n_fields + n_v_fields); this is enforced with ut_a().
+The number of fields used in record comparisons defaults to n_fields.
+@param[in,out]	buf		buffer to use
+@param[in]	buf_size	buffer size
+@param[in]	n_fields	number of fields
+@param[in]	n_v_fields	number of fields on virtual columns
+@return created tuple (inside buf) */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_from_mem(
+	void*	buf,
+	ulint	buf_size,
+	ulint	n_fields,
+	ulint	n_v_fields)
+{
+	dtuple_t*	tuple;
+	ulint		n_t_fields = n_fields + n_v_fields;
+
+	ut_a(buf_size >= DTUPLE_EST_ALLOC(n_t_fields));
+
+	tuple = (dtuple_t*) buf;
+	tuple->info_bits = 0;
+	tuple->n_fields = n_fields;
+	tuple->n_v_fields = n_v_fields;
+	tuple->n_fields_cmp = n_fields;
+	tuple->fields = (dfield_t*) &tuple[1]; /* directly after *tuple */
+	if (n_v_fields > 0) { /* virtual fields follow the other fields */
+		tuple->v_fields = &tuple->fields[n_fields];
+	} else {
+		tuple->v_fields = NULL;
+	}
+
+#ifdef UNIV_DEBUG
+	tuple->magic_n = DATA_TUPLE_MAGIC_N;
+
+	{	/* In the debug version, initialize fields to an error value */
+		ulint	i;
+
+		for (i = 0; i < n_t_fields; i++) {
+			dfield_t*       field;
+
+			if (i >= n_fields) {
+				field = dtuple_get_nth_v_field(
+					tuple, i - n_fields);
+			} else {
+				field = dtuple_get_nth_field(tuple, i);
+			}
+
+			dfield_set_len(field, UNIV_SQL_NULL);
+			field->data = &data_error;
+			dfield_get_type(field)->mtype = DATA_ERROR;
+			dfield_get_type(field)->prtype = DATA_ERROR;
+		}
+	}
+#endif /* UNIV_DEBUG */
+	MEM_CHECK_ADDRESSABLE(tuple->fields, n_t_fields
+			      * sizeof *tuple->fields);
+	MEM_UNDEFINED(tuple->fields, n_t_fields * sizeof *tuple->fields);
+	return(tuple);
+}
+
+/** Give every virtual field of a tuple its own copy of its data.
+@param[in,out]	vrow	tuple that contains the virtual fields
+@param[in,out]	heap	memory heap to allocate the copies from */
+UNIV_INLINE
+void
+dtuple_dup_v_fld(dtuple_t* vrow, mem_heap_t* heap)
+{
+	ulint	n = 0;
+	while (n < vrow->n_v_fields) {
+		dfield_dup(dtuple_get_nth_v_field(vrow, n++), heap);
+	}
+}
+
+/** Reset all virtual fields of a tuple to type DATA_MISSING, SQL NULL.
+@param[in,out]	vrow	tuple that contains the virtual fields */
+UNIV_INLINE
+void
+dtuple_init_v_fld(dtuple_t* vrow)
+{
+	for (ulint n = 0; n < vrow->n_v_fields; ++n) {
+		dfield_t&	v_field = *dtuple_get_nth_v_field(vrow, n);
+		v_field.type.mtype = DATA_MISSING;
+		dfield_set_len(&v_field, UNIV_SQL_NULL);
+	}
+}
+
+/** Create a data tuple without virtual columns in a memory heap.
+The default value for number of fields used in record comparisons
+for this tuple is n_fields.
+@param[in,out]	heap		memory heap where the tuple is created;
+				DTUPLE_EST_ALLOC(n_fields) bytes will be
+				allocated from this heap
+@param[in]	n_fields	number of fields
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(mem_heap_t* heap, ulint n_fields)
+{
+	const ulint	no_v_fields = 0;
+	return dtuple_create_with_vcol(heap, n_fields, no_v_fields);
+}
+
+/** Create a data tuple with virtual columns in a memory heap.
+DTUPLE_EST_ALLOC(n_fields + n_v_fields) bytes are allocated from the heap
+and handed to dtuple_create_from_mem().
+@param[in,out]	heap		memory heap where the tuple is created
+@param[in]	n_fields	number of fields
+@param[in]	n_v_fields	number of fields on virtual col
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_with_vcol(
+	mem_heap_t*	heap,
+	ulint		n_fields,
+	ulint		n_v_fields)
+{
+	ut_ad(heap);
+
+	/* Size the buffer for a tuple with this many fields in total. */
+	const ulint	size = DTUPLE_EST_ALLOC(n_fields + n_v_fields);
+	void* const	mem = mem_heap_alloc(heap, size);
+
+	/* Lay out and initialize the tuple inside the allocation. */
+	dtuple_t* const	created = dtuple_create_from_mem(
+		mem, size, n_fields, n_v_fields);
+	return created;
+}
+
+/** Copy the virtual fields of one data tuple to another. This is a
+shallow copy: the field data is shared, not duplicated. Both tuples
+must have the same number of virtual fields.
+@param[in,out]	d_tuple		destination tuple
+@param[in]	s_tuple		source tuple */
+UNIV_INLINE
+void
+dtuple_copy_v_fields(dtuple_t* d_tuple, const dtuple_t* s_tuple)
+{
+	const ulint	count = dtuple_get_n_v_fields(d_tuple);
+	ut_ad(dtuple_get_n_v_fields(s_tuple) == count);
+
+	for (ulint n = 0; n < count; ++n) {
+		const dfield_t*	from = dtuple_get_nth_v_field(s_tuple, n);
+		dfield_t*	to = dtuple_get_nth_v_field(d_tuple, n);
+		*to = *from;
+	}
+}
+
+/** Copy a data tuple to another. This is a shallow copy; if a deep copy
+is desired, dfield_dup() will have to be invoked on each field.
+Only the fields are copied: info_bits and n_fields_cmp of the new tuple
+keep the values that dtuple_create_with_vcol() gave them.
+@param[in]	tuple	tuple to copy from
+@param[in,out]	heap	memory heap where the tuple is created
+@return own: copy of tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_copy(const dtuple_t* tuple, mem_heap_t* heap)
+{
+	const ulint	n_plain = dtuple_get_n_fields(tuple);
+	const ulint	n_virtual = dtuple_get_n_v_fields(tuple);
+	dtuple_t*	clone = dtuple_create_with_vcol(
+		heap, n_plain, n_virtual);
+
+	/* Ordinary fields first... */
+	for (ulint n = 0; n < n_plain; ++n) {
+		*dtuple_get_nth_field(clone, n)
+			= *dtuple_get_nth_field(tuple, n);
+	}
+
+	/* ...then the fields on virtual columns. */
+	for (ulint n = 0; n < n_virtual; ++n) {
+		*dtuple_get_nth_v_field(clone, n)
+			= *dtuple_get_nth_v_field(tuple, n);
+	}
+
+	return clone;
+}
+
+/** Compute the sum of the data lengths of the fields of a tuple.
+The space occupied by the field structs or the tuple struct is not
+counted. Neither is possible space in externally stored parts of a field.
+Virtual fields are not included. An SQL NULL field is counted as
+dtype_get_sql_null_size() of its type.
+@param[in]	tuple	typed data tuple
+@param[in]	comp	nonzero=ROW_FORMAT=COMPACT
+@return sum of data lengths */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+	const dtuple_t*	tuple,
+	ulint		comp)
+{
+	ut_ad(dtuple_check_typed(tuple));
+	ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+
+	ulint		total = 0;
+	const ulint	n_fields = tuple->n_fields;
+
+	for (ulint n = 0; n < n_fields; n++) {
+		const dfield_t*	field = dtuple_get_nth_field(tuple, n);
+		const ulint	len = dfield_get_len(field);
+
+		if (len != UNIV_SQL_NULL) {
+			total += len;
+			continue;
+		}
+
+		/* SQL NULL: count what dtype_get_sql_null_size()
+		reports for the type of the field. */
+		total += dtype_get_sql_null_size(dfield_get_type(field),
+						 comp);
+	}
+
+	return total;
+}
+
+/** Count the externally stored fields in a data tuple.
+Virtual fields are not examined.
+@param[in]	tuple	typed data tuple
+@return number of fields whose ext flag is set */
+UNIV_INLINE
+ulint
+dtuple_get_n_ext(const dtuple_t* tuple)
+{
+	ut_ad(dtuple_check_typed(tuple));
+	ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+
+	const dfield_t*		field = tuple->fields;
+	const dfield_t* const	end = field + tuple->n_fields;
+	ulint			count = 0;
+
+	for (; field != end; ++field) {
+		/* ext is a 1-bit flag, so this adds 0 or 1. */
+		count += field->ext;
+	}
+
+	return count;
+}
+
+/** Set the type of the first n fields of a tuple to DATA_BINARY
+(the remaining two arguments passed to dtype_set() are 0).
+@param[in,out]	tuple	data tuple
+@param[in]	n	number of fields to set; at most tuple->n_fields */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+	dtuple_t*	tuple,
+	ulint		n)
+{
+	for (ulint field_no = 0; field_no < n; ++field_no) {
+		dfield_t*	field = dtuple_get_nth_field(tuple, field_no);
+
+		/* Only the type is touched; data and length stay as is. */
+		dtype_set(dfield_get_type(field), DATA_BINARY, 0, 0);
+	}
+}
+
+/** Fold a prefix given as the number of fields of a tuple.
+SQL NULL fields do not contribute to the fold value.
+@param[in]	tuple		index record
+@param[in]	n_fields	number of complete fields to fold
+@param[in]	n_bytes		number of bytes to fold in the field that
+				follows the complete ones; 0 if none
+@param[in]	tree_id		index tree ID
+@return the folded value */
+UNIV_INLINE
+ulint
+dtuple_fold(
+	const dtuple_t*	tuple,
+	ulint		n_fields,
+	ulint		n_bytes,
+	index_id_t	tree_id)
+{
+	ut_ad(tuple);
+	ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+	ut_ad(dtuple_check_typed(tuple));
+
+	/* The index tree ID seeds the value. */
+	ulint	result = ut_fold_ull(tree_id);
+
+	for (ulint n = 0; n < n_fields; n++) {
+		const dfield_t*	field = dtuple_get_nth_field(tuple, n);
+		const byte*	data = static_cast<const byte*>(
+			dfield_get_data(field));
+		const ulint	len = dfield_get_len(field);
+
+		if (len == UNIV_SQL_NULL) {
+			continue;
+		}
+
+		result = ut_fold_ulint_pair(result,
+					    ut_fold_binary(data, len));
+	}
+
+	if (n_bytes == 0) {
+		return result;
+	}
+
+	/* Fold at most n_bytes of the field after the complete ones. */
+	const dfield_t*	last = dtuple_get_nth_field(tuple, n_fields);
+	const byte*	data = static_cast<const byte*>(
+		dfield_get_data(last));
+	ulint		len = dfield_get_len(last);
+
+	if (len != UNIV_SQL_NULL) {
+		if (len > n_bytes) {
+			len = n_bytes;
+		}
+		result = ut_fold_ulint_pair(result, ut_fold_binary(data, len));
+	}
+
+	return result;
+}
+
+/** Write an SQL NULL field as len zero bytes.
+@param[out]	data	buffer of at least len bytes
+@param[in]	len	SQL NULL size in bytes */
+UNIV_INLINE
+void
+data_write_sql_null(byte* data, ulint len)
+{
+	const int	zero_fill = 0;
+
+	memset(data, zero_fill, len);
+}
+
+/** Check whether any field of a data tuple is SQL NULL.
+Virtual fields are not examined.
+The scan stops at the first SQL NULL field.
+@param[in]	tuple	data tuple
+@return TRUE if some field is SQL NULL */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+	const dtuple_t*	tuple)
+{
+	const ulint	n_fields = dtuple_get_n_fields(tuple);
+	ulint		n = 0;
+
+	/* Advance past every field that is not SQL NULL. */
+	while (n < n_fields
+	       && !dfield_is_null(dtuple_get_nth_field(tuple, n))) {
+		n++;
+	}
+
+	/* n reached n_fields only if no field was SQL NULL. */
+	const bool	has_null = n < n_fields;
+	return has_null ? TRUE : FALSE;
+}
+
+/** Free the memory heap of a big rec vector.
+@param[in,own]	vector	big rec vector; its heap (vector->heap)
+			is freed in this function */
+UNIV_INLINE
+void
+dtuple_big_rec_free(big_rec_t* vector)
+{
+	mem_heap_t*	owner_heap = vector->heap;
+
+	mem_heap_free(owner_heap);
+}
diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h
new file mode 100644
index 00000000..3d63ddb7
--- /dev/null
+++ b/storage/innobase/include/data0type.h
@@ -0,0 +1,591 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/data0type.h
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "univ.i"
+
+/** Special length indicating a missing instantly added column */
+#define UNIV_SQL_DEFAULT (UNIV_SQL_NULL - 1)
+
+/** @return whether a length is actually stored; evaluates len twice */
+#define len_is_stored(len) ((len) != UNIV_SQL_NULL && (len) != UNIV_SQL_DEFAULT)
+
+extern ulint	data_mysql_default_charset_coll;
+#define DATA_MYSQL_BINARY_CHARSET_COLL 63
+
+/* SQL data type struct */
+struct dtype_t;
+
+/** SQL LIKE operator comparison types */
+enum ib_like_t {
+	IB_LIKE_EXACT,	/**< exact match, e.g. STRING */
+	IB_LIKE_PREFIX	/**< prefix match, e.g. STRING% */
+};
+
+/*-------------------------------------------*/
+/* The 'MAIN TYPE' of a column */
+#define DATA_MISSING	0	/* missing column */
+#define	DATA_VARCHAR	1	/* character varying of the
+				latin1_swedish_ci charset-collation; note
+				that the MySQL format for this, DATA_BINARY,
+				DATA_VARMYSQL, is also affected by whether the
+				'precise type' contains
+				DATA_MYSQL_TRUE_VARCHAR */
+#define DATA_CHAR	2	/* fixed length character of the
+				latin1_swedish_ci charset-collation */
+#define DATA_FIXBINARY	3	/* binary string of fixed length */
+#define DATA_BINARY	4	/* binary string */
+#define DATA_BLOB	5	/* binary large object, or a TEXT type;
+				if prtype & DATA_BINARY_TYPE == 0, then this is
+				actually a TEXT column (or a BLOB created
+				with < 4.0.14; since column prefix indexes
+				came only in 4.0.14, the missing flag in BLOBs
+				created before that does not cause any harm) */
+#define	DATA_INT	6	/* integer: can be any size 1 - 8 bytes */
+#define	DATA_SYS_CHILD	7	/* address of the child page in node pointer */
+#define	DATA_SYS	8	/* system column */
+
+/* Data types >= DATA_FLOAT must be compared using the whole field, not as
+binary strings */
+
+#define DATA_FLOAT	9
+#define DATA_DOUBLE	10
+#define DATA_DECIMAL	11	/* decimal number stored as an ASCII string */
+#define	DATA_VARMYSQL	12	/* any charset varying length char */
+#define	DATA_MYSQL	13	/* any charset fixed length char */
+				/* NOTE that 4.1.1 used DATA_MYSQL and
+				DATA_VARMYSQL for all character sets, and the
+				charset-collation for tables created with it
+				can also be latin1_swedish_ci */
+
+/* DATA_GEOMETRY includes all standard geometry datatypes as described in
+OGC standard(point, line_string, polygon, multi_point, multi_polygon,
+multi_line_string, geometry_collection, geometry).
+Currently, geometry data is stored in the standard Well-Known Binary(WKB)
+format (http://www.opengeospatial.org/standards/sfa).
+We use BLOB as the underlying datatype. */
+#define DATA_GEOMETRY	14	/* geometry datatype of variable length */
+#define DATA_MTYPE_MAX	63	/* dtype_store_for_order_and_null_size()
+				requires the values are <= 63 */
+
+#define DATA_MTYPE_CURRENT_MIN	DATA_VARCHAR	/* minimum value of mtype */
+#define DATA_MTYPE_CURRENT_MAX	DATA_GEOMETRY	/* maximum value of mtype */
+/*-------------------------------------------*/
+/* The 'PRECISE TYPE' of a column */
+/*
+Tables created by a MySQL user have the following convention:
+
+- In the least significant byte in the precise type we store the MySQL type
+code (not applicable for system columns).
+
+- In the second least significant byte we OR flags DATA_NOT_NULL,
+DATA_UNSIGNED, DATA_BINARY_TYPE.
+
+- In the third least significant byte of the precise type of string types we
+store the MySQL charset-collation code. In DATA_BLOB columns created with
+< 4.0.14 we do not actually know if it is a BLOB or a TEXT column. Since there
+are no indexes on prefixes of BLOB or TEXT columns in < 4.0.14, this is no
+problem, though.
+
+Note that versions < 4.1.2 or < 5.0.1 did not store the charset code to the
+precise type, since the charset was always the default charset of the MySQL
+installation. If the stored charset code is 0 in the system table SYS_COLUMNS
+of InnoDB, that means that the default charset of this MySQL installation
+should be used.
+
+When loading a table definition from the system tables to the InnoDB data
+dictionary cache in main memory, InnoDB versions >= 4.1.2 and >= 5.0.1 check
+if the stored charset-collation is 0, and if that is the case and the type is
+a non-binary string, replace that 0 by the default charset-collation code of
+this MySQL installation. In short, in old tables, the charset-collation code
+in the system tables on disk can be 0, but in in-memory data structures
+(dtype_t), the charset-collation code is always != 0 for non-binary string
+types.
+
+In new tables, in binary string types, the charset-collation code is the
+MySQL code for the 'binary charset', that is, != 0.
+
+For binary string types and for DATA_CHAR, DATA_VARCHAR, and for those
+DATA_BLOB which are binary or have the charset-collation latin1_swedish_ci,
+InnoDB performs all comparisons internally, without resorting to the MySQL
+comparison functions. This is to save CPU time.
+
+InnoDB's own internal system tables have different precise types for their
+columns, and for them the precise type is usually not used at all.
+*/
+
+#define DATA_ENGLISH	4	/* English language character string: this
+				is a relic from pre-MySQL time and only used
+				for InnoDB's own system tables */
+#define DATA_ERROR	111	/* another relic from pre-MySQL time */
+
+#define DATA_MYSQL_TYPE_MASK 255U/* AND with this mask to extract the MySQL
+				 type from the precise type */
+#define DATA_MYSQL_TRUE_VARCHAR 15 /* MySQL type code for the >= 5.0.3
+				   format true VARCHAR */
+
+/* Precise data types for system columns and the length of those columns;
+NOTE: the values must run from 0 up in the order given! All codes must
+be less than 256 */
+#define	DATA_ROW_ID	0	/* row id: a 48-bit integer */
+#define DATA_ROW_ID_LEN	6	/* stored length for row id */
+
+#define DATA_TRX_ID	1	/* transaction id: 6 bytes */
+#define DATA_TRX_ID_LEN	6
+
+#define	DATA_ROLL_PTR	2	/* rollback data pointer: 7 bytes */
+#define DATA_ROLL_PTR_LEN 7
+
+#define	DATA_N_SYS_COLS 3	/* number of system columns defined above */
+
+#define DATA_FTS_DOC_ID	3	/* Used as FTS DOC ID column */
+
+#define DATA_SYS_PRTYPE_MASK 0xFU /* mask to extract the above from prtype */
+
+/* Flags ORed to the precise data type */
+#define DATA_NOT_NULL	256U	/* this is ORed to the precise type when
+				the column is declared as NOT NULL */
+#define DATA_UNSIGNED	512U	/* this is ORed to the precise type when
+				we have an unsigned integer type */
+#define	DATA_BINARY_TYPE 1024U	/* if the data type is a binary character
+				string, this is ORed to the precise type:
+				this only holds for tables created with
+				>= MySQL-4.0.14 */
+/* #define	DATA_NONLATIN1	2048 This is a relic from < 4.1.2 and < 5.0.1.
+				In earlier versions this was set for some
+				BLOB columns.
+*/
+#define DATA_GIS_MBR	2048U	/* Used as GIS MBR column */
+/** the size of a GIS maximum bounding rectangle */
+constexpr uint8_t DATA_MBR_LEN= uint8_t(SPDIMS * 2 * sizeof(double));
+
+#define	DATA_LONG_TRUE_VARCHAR 4096U	/* this is ORed to the precise data
+				type when the column is true VARCHAR where
+				MySQL uses 2 bytes to store the data len;
+				for shorter VARCHARs MySQL uses only 1 byte */
+#define	DATA_VIRTUAL	8192U	/* Virtual column */
+
+/** System Versioning */
+#define DATA_VERS_START	16384U	/* start system field */
+#define DATA_VERS_END	32768U	/* end system field */
+/** system-versioned user data column */
+#define DATA_VERSIONED (DATA_VERS_START|DATA_VERS_END)
+
+/*-------------------------------------------*/
+
+/* This many bytes we need to store the type information affecting the
+alphabetical order for a single field and decide the storage size of an
+SQL null*/
+#define DATA_ORDER_NULL_TYPE_BUF_SIZE		4
+/* In the >= 4.1.x storage format we add 2 bytes more so that we can also
+store the charset-collation number; one byte is left unused, though */
+#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE	6
+
+/* Maximum multi-byte character length in bytes, plus 1 */
+#define DATA_MBMAX	8
+
+/* For checking if mtype is GEOMETRY datatype */
+#define DATA_GEOMETRY_MTYPE(mtype)	((mtype) == DATA_GEOMETRY)
+
+/* For checking if mtype is BLOB or GEOMETRY, since we use BLOB as
+the underlying datatype of GEOMETRY data. */
+#define DATA_LARGE_MTYPE(mtype) ((mtype) == DATA_BLOB			\
+				 || (mtype) == DATA_GEOMETRY)
+
+/* Big length data type: len exceeds 255, or mtype is BLOB or GEOMETRY. */
+#define DATA_BIG_LEN_MTYPE(len, mtype) ((len) > 255 || DATA_LARGE_MTYPE(mtype))
+
+/* For checking if the column is a big length column (see above). */
+#define DATA_BIG_COL(col) DATA_BIG_LEN_MTYPE((col)->len, (col)->mtype)
+
+/* Holds for GEOMETRY, and for BLOB whose prtype lacks DATA_BINARY_TYPE. */
+#define DATA_LARGE_BINARY(mtype,prtype) ((mtype) == DATA_GEOMETRY || \
+	((mtype) == DATA_BLOB && !((prtype) & DATA_BINARY_TYPE)))
+
+/* We now support 15 bits (up to 32767) collation number */
+#define MAX_CHAR_COLL_NUM	32767
+
+/* Mask to get the Charset Collation number (0x7fff) */
+#define CHAR_COLL_MASK		MAX_CHAR_COLL_NUM
+
+/*********************************************************************//**
+Gets the MySQL type code from a dtype.
+@return MySQL type code; this is NOT an InnoDB type code! */
+UNIV_INLINE
+ulint
+dtype_get_mysql_type(
+/*=================*/
+	const dtype_t*	type);	/*!< in: type struct */
+/*********************************************************************//**
+Determine how many bytes the first n characters of the given string occupy.
+If the string is shorter than n characters, returns the number of bytes
+the characters in the string occupy.
+@return length of the prefix, in bytes */
+ulint
+dtype_get_at_most_n_mbchars(
+/*========================*/
+	ulint		prtype,		/*!< in: precise type */
+	ulint		mbminlen,	/*!< in: minimum length of
+					a multi-byte character, in bytes */
+	ulint		mbmaxlen,	/*!< in: maximum length of
+					a multi-byte character, in bytes */
+	ulint		prefix_len,	/*!< in: length of the requested
+					prefix, in characters, multiplied by
+					dtype_get_mbmaxlen(dtype) */
+	ulint		data_len,	/*!< in: length of str (in bytes) */
+	const char*	str);		/*!< in: the string whose prefix
+					length is being determined */
+/** @return whether the main type mtype is a string type */
+inline bool dtype_is_string_type(ulint mtype)
+{
+	if (mtype == DATA_MYSQL || mtype == DATA_VARMYSQL) return true;
+	return mtype <= DATA_BLOB;
+}
+
+/** @return whether (mtype, prtype) describes a binary string type */
+inline bool dtype_is_binary_string_type(ulint mtype, ulint prtype)
+{
+	/* Tables created before MySQL 4.0.14 lack DATA_BINARY_TYPE, so their
+	DATA_BLOB columns (BLOB or TEXT, we cannot tell) yield false. */
+	if (mtype == DATA_BLOB) {
+		return (prtype & DATA_BINARY_TYPE) != 0;
+	}
+	return mtype == DATA_FIXBINARY || mtype == DATA_BINARY;
+}
+
+/** @return whether (mtype, prtype) is a string type that is not binary */
+inline bool dtype_is_non_binary_string_type(ulint mtype, ulint prtype)
+{
+	const bool	is_string = dtype_is_string_type(mtype);
+	return is_string && !dtype_is_binary_string_type(mtype, prtype);
+}
+
+/*********************************************************************//**
+Initializes a data type structure from a main type, precise type and length. */
+UNIV_INLINE
+void
+dtype_set(
+/*======*/
+	dtype_t*	type,	/*!< out: type struct to init */
+	ulint		mtype,	/*!< in: main data type */
+	ulint		prtype,	/*!< in: precise type */
+	ulint		len);	/*!< in: length of type (dtype_t::len) */
+/*********************************************************************//**
+Copies a data type structure (all members of type2 are assigned to type1). */
+UNIV_INLINE
+void
+dtype_copy(
+/*=======*/
+	dtype_t*	type1,	/*!< out: type struct to copy to */
+	const dtype_t*	type2);	/*!< in: type struct to copy from */
+/*********************************************************************//**
+Gets the SQL main data type.
+@return SQL main data type */
+UNIV_INLINE
+ulint
+dtype_get_mtype(
+/*============*/
+	const dtype_t*	type);	/*!< in: data type */
+/*********************************************************************//**
+Gets the precise data type.
+@return precise data type */
+UNIV_INLINE
+ulint
+dtype_get_prtype(
+/*=============*/
+	const dtype_t*	type);	/*!< in: data type */
+
+/*********************************************************************//**
+Compute the mbminlen and mbmaxlen members of a data type structure. */
+void
+dtype_get_mblen(
+/*============*/
+	ulint	mtype,		/*!< in: main type */
+	ulint	prtype,		/*!< in: precise type (and collation) */
+	unsigned* mbminlen,	/*!< out: minimum length of a
+				multi-byte character */
+	unsigned* mbmaxlen);	/*!< out: maximum length of a
+				multi-byte character */
+/**
+Extract the charset-collation code kept in bits 16 and up of a precise type.
+@param  prtype  InnoDB precise type
+@return charset-collation code */
+inline uint16_t dtype_get_charset_coll(ulint prtype)
+{
+  return CHAR_COLL_MASK & static_cast<uint16_t>(prtype >> 16);
+}
+
+/** Combine a < 4.1.2 format precise type with a charset-collation code.
+The collation code is placed in the bits starting at bit 16.
+@param[in]	old_prtype	MySQL type code and the flags
+				DATA_BINARY_TYPE etc.
+@param[in]	charset_coll	character-set collation code
+@return precise type, including the charset-collation code */
+UNIV_INLINE
+uint32_t
+dtype_form_prtype(ulint old_prtype, ulint charset_coll)
+{
+	ut_ad(old_prtype < 256 * 256);
+	ut_ad(charset_coll <= MAX_CHAR_COLL_NUM);
+	return static_cast<uint32_t>((charset_coll << 16) + old_prtype);
+}
+
+/*********************************************************************//**
+Determines if a MySQL string type is a subset of UTF-8.  This function
+may return false negatives, in case further character-set collation
+codes are introduced in MySQL later.
+@return whether a subset of UTF-8 */
+UNIV_INLINE
+bool
+dtype_is_utf8(
+/*==========*/
+	ulint	prtype);/*!< in: precise data type */
+/*********************************************************************//**
+Gets the type length.
+@return the length stored in the type (dtype_t::len) */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+	const dtype_t*	type);	/*!< in: data type */
+
+/*********************************************************************//**
+Gets the minimum length of a character, in bytes.
+@return minimum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbminlen(
+/*===============*/
+	const dtype_t*	type);	/*!< in: type */
+/*********************************************************************//**
+Gets the maximum length of a character, in bytes.
+@return maximum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbmaxlen(
+/*===============*/
+	const dtype_t*	type);	/*!< in: type */
+/***********************************************************************//**
+Returns the size of a fixed size data type, 0 if not a fixed size type.
+@return fixed size, or 0 */
+UNIV_INLINE
+unsigned
+dtype_get_fixed_size_low(
+/*=====================*/
+	ulint	mtype,		/*!< in: main type */
+	ulint	prtype,		/*!< in: precise type */
+	ulint	len,		/*!< in: length */
+	ulint	mbminlen,	/*!< in: minimum length of a
+				multibyte character, in bytes */
+	ulint	mbmaxlen,	/*!< in: maximum length of a
+				multibyte character, in bytes */
+	ulint	comp);		/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+
+/***********************************************************************//**
+Returns the minimum size of a data type.
+@return minimum size */
+UNIV_INLINE
+unsigned
+dtype_get_min_size_low(
+/*===================*/
+	ulint	mtype,		/*!< in: main type */
+	ulint	prtype,		/*!< in: precise type */
+	ulint	len,		/*!< in: length */
+	ulint	mbminlen,	/*!< in: minimum length of a character */
+	ulint	mbmaxlen);	/*!< in: maximum length of a character */
+/***********************************************************************//**
+Returns the maximum size of a data type. Note: types in system tables may be
+incomplete and return incorrect information.
+@return maximum size */
+UNIV_INLINE
+ulint
+dtype_get_max_size_low(
+/*===================*/
+	ulint	mtype,		/*!< in: main type */
+	ulint	len);		/*!< in: length */
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dtype_get_sql_null_size(
+/*====================*/
+	const dtype_t*	type,	/*!< in: type */
+	ulint		comp);	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the SQL NULL storage size (the < 4.1.x storage format). */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+	dtype_t*	type,	/*!< out: type struct */
+	const byte*	buf);	/*!< in: buffer for the stored order info */
+/**********************************************************************//**
+Stores for a type the information which determines its alphabetical ordering
+and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_store_for_order_and_null_size(
+/*====================================*/
+	byte*		buf,	/*!< out: buffer of
+				DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+				bytes where we store the info */
+	const dtype_t*	type,	/*!< in: type struct */
+	ulint		prefix_len);/*!< in: prefix length to
+				replace type->len, or 0 */
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the >= 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
+	dtype_t*	type,	/*!< out: type struct */
+	const byte*	buf);	/*!< in: buffer for stored type order info */
+
+/*********************************************************************//**
+Validates a data type structure.
+@return TRUE if ok */
+ibool
+dtype_validate(
+/*===========*/
+	const dtype_t*	type);	/*!< in: type struct to validate */
+#ifdef UNIV_DEBUG
+/** Print a data type structure.
+@param[in]	type	data type */
+void
+dtype_print(
+	const dtype_t*	type);
+#endif /* UNIV_DEBUG */
+
+struct dict_col_t;
+
+/* Structure for an SQL data type.
+If you add fields to this structure, be sure to initialize them everywhere.
+This structure is initialized in the following functions:
+dtype_set()
+dtype_read_for_order_and_null_size()
+dtype_new_read_for_order_and_null_size()
+sym_tab_add_null_lit() */
+
+struct dtype_t{
+	unsigned	prtype:32;	/*!< precise type; MySQL data
+					type, charset code, flags to
+					indicate nullability,
+					signedness, whether this is a
+					binary string, whether this is
+					a true VARCHAR where MySQL
+					uses 2 bytes to store the length */
+	unsigned	mtype:8;	/*!< main data type */
+
+	/* the remaining fields do not affect alphabetical ordering: */
+
+	unsigned	len:16;		/*!< length; for MySQL data this
+					is field->pack_length(),
+					except that for a >= 5.0.3
+					type true VARCHAR this is the
+					maximum byte length of the
+					string data (in addition to
+					the string, MySQL uses 1 or 2
+					bytes to store the string length) */
+	unsigned	mbminlen:3;	/*!< minimum length of a character,
+					in bytes */
+	unsigned	mbmaxlen:3;	/*!< maximum length of a character,
+					in bytes */
+
+	/** @return whether this is system versioned user field
+	(both DATA_VERS_START and DATA_VERS_END are set in prtype) */
+	bool is_versioned() const
+	{ return (prtype & DATA_VERSIONED) == DATA_VERSIONED; }
+	/** @return whether this is the system field start
+	(of the two versioning flags, only DATA_VERS_START is set) */
+	bool vers_sys_start() const
+	{ return DATA_VERS_START == (prtype & DATA_VERSIONED); }
+	/** @return whether this is the system field end
+	(of the two versioning flags, only DATA_VERS_END is set) */
+	bool vers_sys_end() const
+	{ return DATA_VERS_END == (prtype & DATA_VERSIONED); }
+
+	/** Set the type of the BLOB in the hidden metadata record. */
+	void metadata_blob_init()
+	{
+		mtype = DATA_BLOB;
+		prtype = DATA_NOT_NULL;
+		mbmaxlen = 0;
+		mbminlen = 0;
+		len = 0;
+	}
+
+	/** Copy the type information from a column.
+	@param col column type to be copied */
+	void assign(const dict_col_t &col);
+};
+
+/** The DB_TRX_ID,DB_ROLL_PTR values for "no history is available" */
+extern const byte reset_trx_id[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
+
+/** Info bit denoting the predefined minimum record: this bit is set
+if and only if the record is the first user record on a non-leaf
+B-tree page that is the leftmost page on its level
+(PAGE_LEVEL is nonzero and FIL_PAGE_PREV is FIL_NULL). */
+#define REC_INFO_MIN_REC_FLAG	0x10UL
+/** The delete-mark flag in info bits */
+#define REC_INFO_DELETED_FLAG	0x20UL
+
+/** Record status values for ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED */
+enum rec_comp_status_t {
+	/** User record (PAGE_LEVEL=0, heap>=PAGE_HEAP_NO_USER_LOW) */
+	REC_STATUS_ORDINARY = 0,
+	/** Node pointer record (PAGE_LEVEL>=0, heap>=PAGE_HEAP_NO_USER_LOW) */
+	REC_STATUS_NODE_PTR = 1,
+	/** The page infimum pseudo-record (heap=PAGE_HEAP_NO_INFIMUM) */
+	REC_STATUS_INFIMUM = 2,
+	/** The page supremum pseudo-record (heap=PAGE_HEAP_NO_SUPREMUM) */
+	REC_STATUS_SUPREMUM = 3,
+	/** Clustered index record that has been inserted or updated
+	after instant ADD COLUMN (more than dict_index_t::n_core_fields) */
+	REC_STATUS_INSTANT = 4
+};
+
+/** The dtuple_t::info_bits (0x14) of the hidden metadata of instant ADD COLUMN.
+@see rec_is_metadata()
+@see rec_is_alter_metadata() */
+static const byte REC_INFO_METADATA_ADD
+	= REC_INFO_MIN_REC_FLAG | REC_STATUS_INSTANT;
+
+/** The dtuple_t::info_bits (0x34) of the hidden metadata of instant ALTER TABLE.
+@see rec_is_metadata() */
+static const byte REC_INFO_METADATA_ALTER
+	= REC_INFO_METADATA_ADD | REC_INFO_DELETED_FLAG;
+
+#include "data0type.inl"
diff --git a/storage/innobase/include/data0type.inl b/storage/innobase/include/data0type.inl
new file mode 100644
index 00000000..329cee5d
--- /dev/null
+++ b/storage/innobase/include/data0type.inl
@@ -0,0 +1,487 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/data0type.inl
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#include "ha_prototypes.h"
+
+/*********************************************************************//**
+Tells whether the character set of a MySQL string type is a subset of
+UTF-8. Only a fixed list of collation codes is recognized, so collations
+introduced in MySQL later are reported as false (false negatives).
+@return whether a subset of UTF-8 */
+UNIV_INLINE
+bool
+dtype_is_utf8(
+/*==========*/
+	ulint	prtype)	/*!< in: precise data type */
+{
+	/* The collation codes below have been copied from
+	strings/ctype-extra.c and strings/ctype-utf8.c. */
+	const uint16_t	coll = dtype_get_charset_coll(prtype);
+
+	const bool	is_ascii = coll == 11		/* ascii_general_ci */
+		|| coll == 65;				/* ascii_bin */
+
+	const bool	is_utf8 = coll == 33		/* utf8_general_ci */
+		|| coll == 83				/* utf8_bin */
+		|| coll == 254;				/* utf8_general_cs */
+
+	return is_ascii || is_utf8;
+}
+
+/*********************************************************************//**
+Extracts the MySQL type code, i.e. the lowest 8 bits of the precise type.
+@return MySQL type code; this is NOT an InnoDB type code! */
+UNIV_INLINE
+ulint
+dtype_get_mysql_type(
+/*=================*/
+	const dtype_t*	type)	/*!< in: type struct */
+{
+	return(0xFFUL & type->prtype);
+}
+
+/*********************************************************************//**
+Computes the mbminlen and mbmaxlen members of a type (low 3 bits are kept). */
+UNIV_INLINE
+void
+dtype_set_mblen(
+/*============*/
+	dtype_t*	type)	/*!< in/out: type */
+{
+	unsigned	min_bytes;
+	unsigned	max_bytes;
+
+	dtype_get_mblen(type->mtype, type->prtype, &min_bytes, &max_bytes);
+	type->mbmaxlen = max_bytes & 7;
+	type->mbminlen = min_bytes & 7;
+	ut_ad(dtype_validate(type));
+}
+
+/*********************************************************************//**
+Initializes a data type structure, including mbminlen and mbmaxlen. */
+UNIV_INLINE
+void
+dtype_set(
+/*======*/
+	dtype_t*	type,	/*!< out: type struct to init */
+	ulint		mtype,	/*!< in: main data type */
+	ulint		prtype,	/*!< in: precise type */
+	ulint		len)	/*!< in: length of type (dtype_t::len) */
+{
+	ut_ad(type);
+	ut_ad(mtype <= DATA_MTYPE_MAX);
+
+	/* Narrow each argument to the width of the member it is stored in. */
+	type->len = static_cast<uint16_t>(len);
+	type->prtype = static_cast<unsigned>(prtype);
+	type->mtype = static_cast<byte>(mtype);
+	dtype_set_mblen(type);
+}
+
+/*********************************************************************//**
+Copies a data type structure by assigning all members of type2 to type1. */
+UNIV_INLINE
+void
+dtype_copy(
+/*=======*/
+	dtype_t*	type1,	/*!< out: type struct to copy to */
+	const dtype_t*	type2)	/*!< in: type struct to copy from */
+{
+	dtype_t&	dest = *type1;
+	dest = *type2;
+	ut_ad(dtype_validate(type1));
+}
+
+/*********************************************************************//**
+Reads the SQL main data type of a type.
+@return SQL main data type (dtype_t::mtype) */
+UNIV_INLINE
+ulint
+dtype_get_mtype(
+/*============*/
+	const dtype_t*	type)	/*!< in: data type */
+{
+	ut_ad(type);
+	const ulint	main_type = type->mtype;
+	return main_type;
+}
+
+/*********************************************************************//**
+Reads the precise data type of a type.
+@return precise data type (dtype_t::prtype) */
+UNIV_INLINE
+ulint
+dtype_get_prtype(
+/*=============*/
+	const dtype_t*	type)	/*!< in: data type */
+{
+	ut_ad(type);
+	const ulint	precise_type = type->prtype;
+	return precise_type;
+}
+
+/*********************************************************************//**
+Reads the length of a type.
+@return the length stored in the type (dtype_t::len) */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+	const dtype_t*	type)	/*!< in: data type */
+{
+	ut_ad(type);
+	const ulint	length = type->len;
+	return length;
+}
+
+/*********************************************************************//**
+Reads the minimum length of a character, in bytes, from a type.
+@return minimum character length, or 0 if this is not a character type */
+UNIV_INLINE
+ulint
+dtype_get_mbminlen(
+/*===============*/
+	const dtype_t*	type)	/*!< in: type */
+{
+	const ulint	min_bytes = type->mbminlen;
+	return min_bytes;
+}
+/*********************************************************************//**
+Reads the maximum length of a character, in bytes, from a type.
+@return maximum character length, or 0 if this is not a character type */
+UNIV_INLINE
+ulint
+dtype_get_mbmaxlen(
+/*===============*/
+	const dtype_t*	type)	/*!< in: type */
+{
+	const ulint	max_bytes = type->mbmaxlen;
+	return max_bytes;
+}
+
+/**********************************************************************//**
+Stores for a type the information which determines its alphabetical ordering
+and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_store_for_order_and_null_size(
+/*====================================*/
+	byte*		buf,	/*!< out: buffer of
+				DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+				bytes where we store the info */
+	const dtype_t*	type,	/*!< in: type struct */
+	ulint		prefix_len)/*!< in: prefix length to
+				replace type->len, or 0 */
+{
+	compile_time_assert(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+	ulint	len;
+
+	ut_ad(type);
+	ut_ad(type->mtype >= DATA_VARCHAR);
+	ut_ad(type->mtype <= DATA_MTYPE_MAX);
+
+	buf[0] = (byte)(type->mtype & 0xFFUL);	/* byte 0: main type */
+
+	if (type->prtype & DATA_BINARY_TYPE) {
+		buf[0] |= 128;	/* bit 7 of byte 0: DATA_BINARY_TYPE flag */
+	}
+
+	/* In versions < 4.1.2 we had:	if (type->prtype & DATA_NONLATIN1) {
+	buf[0] |= 64;
+	}
+	*/
+
+	buf[1] = (byte)(type->prtype & 0xFFUL);	/* byte 1: low 8 bits of prtype */
+
+	len = prefix_len ? prefix_len : type->len;	/* nonzero prefix_len wins */
+
+	mach_write_to_2(buf + 2, len & 0xFFFFUL);	/* bytes 2..3: low 16 bits */
+
+	ut_ad(dtype_get_charset_coll(type->prtype) <= MAX_CHAR_COLL_NUM);
+	mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype));
+
+	if (type->prtype & DATA_NOT_NULL) {
+		buf[4] |= 128;	/* must follow the write to buf + 4 above */
+	}
+}
+
+/**********************************************************************//**
+Fills in a type from the stored information which determines its
+alphabetical ordering and the storage size of an SQL NULL value.
+This is the < 4.1.x storage format. */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+	dtype_t*	type,	/*!< out: type struct */
+	const byte*	buf)	/*!< in: buffer for stored type order info */
+{
+	compile_time_assert(4 == DATA_ORDER_NULL_TYPE_BUF_SIZE);
+
+	/* buf[0] holds the main type in its low 6 bits and the binary flag
+	in bit 7; buf[1] holds the 8-bit precise type of the old format. */
+	const ulint	old_prtype = (buf[0] & 128)
+		? buf[1] | DATA_BINARY_TYPE : buf[1];
+
+	type->mtype = buf[0] & 63;
+	type->len = mach_read_from_2(buf + 2);
+	type->prtype = dtype_form_prtype(old_prtype,
+					 data_mysql_default_charset_coll);
+
+	dtype_set_mblen(type);
+}
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the >= 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
+	dtype_t*	type,	/*!< out: type struct */
+	const byte*	buf)	/*!< in: buffer for stored type order info */
+{
+	compile_time_assert(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+	type->mtype = buf[0] & 63;	/* low 6 bits of byte 0: main type */
+	type->prtype = buf[1];		/* byte 1: low 8 bits of prtype */
+
+	if (buf[0] & 128) {		/* bit 7 of byte 0: binary flag */
+		type->prtype |= DATA_BINARY_TYPE;
+	}
+
+	if (buf[4] & 128) {		/* bit 7 of byte 4: NOT NULL flag */
+		type->prtype |= DATA_NOT_NULL;
+	}
+
+	type->len = mach_read_from_2(buf + 2);
+
+	ulint charset_coll = mach_read_from_2(buf + 4) & CHAR_COLL_MASK;
+
+	if (dtype_is_string_type(type->mtype)) {
+		ut_a(charset_coll <= MAX_CHAR_COLL_NUM); /* holds after masking */
+
+		if (charset_coll == 0) {
+			/* This insert buffer record was inserted with MySQL
+			version < 4.1.2, and the charset-collation code was not
+			explicitly stored to dtype->prtype at that time. It
+			must be the default charset-collation of this MySQL
+			installation. */
+
+			charset_coll = data_mysql_default_charset_coll;
+		}
+
+		type->prtype = dtype_form_prtype(type->prtype, charset_coll);
+	}
+	dtype_set_mblen(type);
+}
+
+/***********************************************************************//**
+Returns the size of a fixed size data type, 0 if not a fixed size type.
+@return fixed size (the given len), or 0 if the type is not fixed size */
+UNIV_INLINE
+unsigned
+dtype_get_fixed_size_low(
+/*=====================*/
+	ulint	mtype,		/*!< in: main type */
+	ulint	prtype,		/*!< in: precise type */
+	ulint	len,		/*!< in: length */
+	ulint	mbminlen,	/*!< in: minimum length of a
+				multibyte character, in bytes */
+	ulint	mbmaxlen,	/*!< in: maximum length of a
+				multibyte character, in bytes */
+	ulint	comp)		/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+{
+	switch (mtype) {
+	case DATA_SYS:
+#ifdef UNIV_DEBUG
+		switch (prtype & DATA_MYSQL_TYPE_MASK) {
+		case DATA_ROW_ID:
+			ut_ad(len == DATA_ROW_ID_LEN);
+			break;
+		case DATA_TRX_ID:
+			ut_ad(len == DATA_TRX_ID_LEN);
+			break;
+		case DATA_ROLL_PTR:
+			ut_ad(len == DATA_ROLL_PTR_LEN);
+			break;
+		default:
+			ut_ad(0);	/* not one of the 3 system columns */
+			return(0);
+		}
+#endif /* UNIV_DEBUG */
+		/* fall through */
+	case DATA_CHAR:
+	case DATA_FIXBINARY:
+	case DATA_INT:
+	case DATA_FLOAT:
+	case DATA_DOUBLE:
+		return static_cast<unsigned>(len);
+	case DATA_MYSQL:
+		if (prtype & DATA_BINARY_TYPE) {
+			return static_cast<unsigned>(len);	/* binary */
+		} else if (!comp) {
+			return static_cast<unsigned>(len);	/* comp == 0 */
+		} else {
+			if (mbminlen == mbmaxlen) {	/* fixed-width charset */
+				return static_cast<unsigned>(len);
+			}
+		}
+		/* Treat as variable-length. */
+		/* fall through */
+	case DATA_VARCHAR:
+	case DATA_BINARY:
+	case DATA_DECIMAL:
+	case DATA_VARMYSQL:
+	case DATA_GEOMETRY:
+	case DATA_BLOB:
+		return(0);
+	default:
+		ut_error;	/* main type not handled by this function */
+	}
+
+	return(0);
+}
+
+/***********************************************************************//**
+Returns the minimum size of a data type.
+@return minimum size: len, a fraction of len, or 0, depending on mtype */
+UNIV_INLINE
+unsigned
+dtype_get_min_size_low(
+/*===================*/
+	ulint	mtype,		/*!< in: main type */
+	ulint	prtype,		/*!< in: precise type */
+	ulint	len,		/*!< in: length */
+	ulint	mbminlen,	/*!< in: minimum length of a character */
+	ulint	mbmaxlen)	/*!< in: maximum length of a character */
+{
+	switch (mtype) {
+	case DATA_SYS:
+#ifdef UNIV_DEBUG
+		switch (prtype & DATA_MYSQL_TYPE_MASK) {
+		case DATA_ROW_ID:
+			ut_ad(len == DATA_ROW_ID_LEN);
+			break;
+		case DATA_TRX_ID:
+			ut_ad(len == DATA_TRX_ID_LEN);
+			break;
+		case DATA_ROLL_PTR:
+			ut_ad(len == DATA_ROLL_PTR_LEN);
+			break;
+		default:
+			ut_ad(0);	/* not one of the 3 system columns */
+			return(0);
+		}
+#endif /* UNIV_DEBUG */
+		/* fall through */
+	case DATA_CHAR:
+	case DATA_FIXBINARY:
+	case DATA_INT:
+	case DATA_FLOAT:
+	case DATA_DOUBLE:
+		return static_cast<unsigned>(len);
+	case DATA_MYSQL:
+		if (prtype & DATA_BINARY_TYPE) {
+			return static_cast<unsigned>(len);	/* binary */
+		} else {
+			if (mbminlen == mbmaxlen) {	/* fixed-width charset */
+				return static_cast<unsigned>(len);
+			}
+
+			/* this is a variable-length character set */
+			ut_a(mbminlen > 0);
+			ut_a(mbmaxlen > mbminlen);
+			ut_a(len % mbmaxlen == 0);
+			return static_cast<unsigned>(
+				len * mbminlen / mbmaxlen);
+		}
+	case DATA_VARCHAR:
+	case DATA_BINARY:
+	case DATA_DECIMAL:
+	case DATA_VARMYSQL:
+	case DATA_GEOMETRY:
+	case DATA_BLOB:
+		return(0);
+	default:
+		ut_error;	/* main type not handled by this function */
+	}
+
+	return(0);
+}
+
+/***********************************************************************//**
+Returns the maximum size of a data type: len for every handled main type
+except DATA_GEOMETRY and DATA_BLOB, which yield ULINT_MAX. Note: types in
+system tables may be incomplete and return incorrect information. */
+UNIV_INLINE
+ulint
+dtype_get_max_size_low(
+/*===================*/
+	ulint	mtype,		/*!< in: main type */
+	ulint	len)		/*!< in: length */
+{
+	switch (mtype) {
+	case DATA_GEOMETRY:
+	case DATA_BLOB:
+		return(ULINT_MAX);
+	case DATA_SYS:
+	case DATA_CHAR:
+	case DATA_FIXBINARY:
+	case DATA_INT:
+	case DATA_FLOAT:
+	case DATA_DOUBLE:
+	case DATA_MYSQL:
+	case DATA_VARCHAR:
+	case DATA_BINARY:
+	case DATA_DECIMAL:
+	case DATA_VARMYSQL:
+		return(len);
+	default:
+		ut_error;
+	}
+
+	return(ULINT_MAX);
+}
+
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type.
+@return the fixed length for fixed length types, otherwise 0 */
+UNIV_INLINE
+ulint
+dtype_get_sql_null_size(
+/*====================*/
+	const dtype_t*	type,	/*!< in: type */
+	ulint		comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+{
+	return dtype_get_fixed_size_low(
+		type->mtype, type->prtype, type->len,
+		type->mbminlen, type->mbmaxlen, comp);
+}
diff --git a/storage/innobase/include/data0types.h b/storage/innobase/include/data0types.h
new file mode 100644
index 00000000..bcd6b8bc
--- /dev/null
+++ b/storage/innobase/include/data0types.h
@@ -0,0 +1,36 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0types.h
+Some type definitions
+
+Created 9/21/2000 Heikki Tuuri
+*************************************************************************/
+
+#ifndef data0types_h
+#define data0types_h
+
+/* SQL data field struct */
+struct dfield_t;
+
+/* SQL data tuple struct */
+struct dtuple_t;
+
+#endif
+
diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h
new file mode 100644
index 00000000..64182aab
--- /dev/null
+++ b/storage/innobase/include/db0err.h
@@ -0,0 +1,170 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/db0err.h
+Global error codes for the database
+
+Created 5/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef db0err_h
+#define db0err_h
+
+/* Do not include univ.i because univ.i includes this. */
+
+enum dberr_t {
+	DB_SUCCESS,
+
+	DB_SUCCESS_LOCKED_REC = 9,	/*!< like DB_SUCCESS, but a new
+					explicit record lock was created */
+
+	/* The following are error codes */
+	DB_ERROR = 11,
+	DB_INTERRUPTED,
+	DB_OUT_OF_MEMORY,
+	DB_OUT_OF_FILE_SPACE,
+	DB_LOCK_WAIT,
+	DB_DEADLOCK,
+	DB_ROLLBACK,
+	DB_DUPLICATE_KEY,
+	DB_MISSING_HISTORY,		/*!< required history data has been
+					deleted due to lack of space in
+					rollback segment */
+	DB_CLUSTER_NOT_FOUND = 30,
+	DB_TABLE_NOT_FOUND,
+	DB_TOO_BIG_RECORD,		/*!< a record in an index would not fit
+					on a compressed page, or it would
+					become bigger than 1/2 free space in
+					an uncompressed page frame */
+	DB_LOCK_WAIT_TIMEOUT,		/*!< lock wait lasted too long */
+	DB_NO_REFERENCED_ROW,		/*!< referenced key value not found
+					for a foreign key in an insert or
+					update of a row */
+	DB_ROW_IS_REFERENCED,		/*!< cannot delete or update a row
+					because it contains a key value
+					which is referenced */
+	DB_CANNOT_ADD_CONSTRAINT,	/*!< adding a foreign key constraint
+					to a table failed */
+	DB_CORRUPTION,			/*!< data structure corruption
+					noticed */
+	DB_CANNOT_DROP_CONSTRAINT,	/*!< dropping a foreign key constraint
+					from a table failed */
+	DB_NO_SAVEPOINT,		/*!< no savepoint exists with the given
+					name */
+	DB_TABLESPACE_EXISTS,		/*!< we cannot create a new single-table
+					tablespace because a file of the same
+					name already exists */
+	DB_TABLESPACE_DELETED,		/*!< tablespace was deleted or is
+					being dropped right now */
+	DB_TABLESPACE_NOT_FOUND,	/*!< Attempt to delete a tablespace
+					instance that was not found in the
+					tablespace hash table */
+	DB_LOCK_TABLE_FULL,		/*!< lock structs have exhausted the
+					buffer pool (for big transactions,
+					InnoDB stores the lock structs in the
+					buffer pool) */
+	DB_FOREIGN_DUPLICATE_KEY,	/*!< foreign key constraints
+					activated by the operation would
+					lead to a duplicate key in some
+					table */
+	DB_TOO_MANY_CONCURRENT_TRXS,	/*!< when InnoDB runs out of the
+					preconfigured undo slots, this can
+					only happen when there are too many
+					concurrent transactions */
+	DB_UNSUPPORTED,			/*!< when InnoDB sees any artefact or
+					a feature that it can't recognize or
+					work with e.g., FT indexes created by
+					a later version of the engine. */
+
+	DB_INVALID_NULL,		/*!< a NOT NULL column was found to
+					be NULL during table rebuild */
+
+	DB_STATS_DO_NOT_EXIST,		/*!< an operation that requires the
+					persistent storage, used for recording
+					table and index statistics, was
+					requested but this storage does not
+					exist itself or the stats for a given
+					table do not exist */
+	DB_FOREIGN_EXCEED_MAX_CASCADE,	/*!< Foreign key constraint related
+					cascading delete/update exceeds
+					maximum allowed depth */
+	DB_CHILD_NO_INDEX,		/*!< the child (foreign) table does
+					not have an index that contains the
+					foreign keys as its prefix columns */
+	DB_PARENT_NO_INDEX,		/*!< the parent table does not
+					have an index that contains the
+					foreign keys as its prefix columns */
+	DB_TOO_BIG_INDEX_COL,		/*!< index column size exceeds
+					maximum limit */
+	DB_INDEX_CORRUPT,		/*!< we have corrupted index */
+	DB_UNDO_RECORD_TOO_BIG,		/*!< the undo log record is too big */
+	DB_READ_ONLY,			/*!< Update operation attempted in
+					a read-only transaction */
+	DB_FTS_INVALID_DOCID,		/*!< FTS Doc ID cannot be zero */
+	DB_ONLINE_LOG_TOO_BIG,		/*!< Modification log grew too big
+					during online index creation */
+
+	DB_IDENTIFIER_TOO_LONG,		/*!< Identifier name too long */
+	DB_FTS_EXCEED_RESULT_CACHE_LIMIT,	/*!< FTS query memory
+					exceeds result cache limit */
+	DB_TEMP_FILE_WRITE_FAIL,	/*!< Temp file write failure */
+	DB_CANT_CREATE_GEOMETRY_OBJECT,	/*!< Cannot create specified Geometry
+					data object */
+	DB_CANNOT_OPEN_FILE,		/*!< Cannot open a file */
+	DB_FTS_TOO_MANY_WORDS_IN_PHRASE,
+					/*!< Too many words in a phrase */
+
+	DB_DECRYPTION_FAILED,		/*!< Tablespace encrypted and
+					decrypt operation failed because
+					of missing key management plugin,
+					or missing or incorrect key or
+					incorrect AES method or algorithm. */
+
+	DB_IO_ERROR = 100,		/*!< Generic IO error */
+
+	DB_IO_PARTIAL_FAILED,		/*!< Partial IO request failed */
+
+	DB_TABLE_CORRUPT,		/*!< Table/clustered index is
+					corrupted */
+
+	DB_COMPUTE_VALUE_FAILED,	/*!< Compute generated value failed */
+
+	DB_NO_FK_ON_S_BASE_COL,		/*!< Cannot add foreign constraint
+					placed on the base column of
+					stored column */
+
+	DB_IO_NO_PUNCH_HOLE,		/*!< Punch hole not supported by
+					file system. */
+
+	DB_PAGE_CORRUPTED,		/*!< Page read from tablespace is
+					corrupted. */
+	/* The following are partial failure codes */
+	DB_FAIL = 1000,
+	DB_OVERFLOW,
+	DB_UNDERFLOW,
+	DB_STRONG_FAIL,
+	DB_ZIP_OVERFLOW,
+	DB_RECORD_NOT_FOUND = 1500,
+	DB_END_OF_INDEX,
+	DB_NOT_FOUND,			/*!< Generic error code for "Not found"
+					type of errors */
+};
+
+#endif
diff --git a/storage/innobase/include/dict0boot.h b/storage/innobase/include/dict0boot.h
new file mode 100644
index 00000000..a6528747
--- /dev/null
+++ b/storage/innobase/include/dict0boot.h
@@ -0,0 +1,297 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0boot.h
+Data dictionary creation and booting
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0boot_h
+#define dict0boot_h
+
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "ut0byte.h"
+#include "buf0buf.h"
+#include "dict0dict.h"
+
+/**********************************************************************//**
+Returns a new table, index, or space id. */
+void
+dict_hdr_get_new_id(
+/*================*/
+	table_id_t*		table_id,	/*!< out: table id
+						(not assigned if NULL) */
+	index_id_t*		index_id,	/*!< out: index id
+						(not assigned if NULL) */
+	uint32_t*		space_id);	/*!< out: space id
+						(not assigned if NULL) */
+/** Update dict_sys.row_id in the dictionary header file page. */
+void dict_hdr_flush_row_id(row_id_t id);
+/** @return a new DB_ROW_ID for GEN_CLUST_INDEX (the pre-increment row_id) */
+inline row_id_t dict_sys_t::get_new_row_id()
+{
+  const row_id_t allocated= row_id.fetch_add(1);
+  const bool at_write_margin= allocated % ROW_ID_WRITE_MARGIN == 0;
+  if (at_write_margin) dict_hdr_flush_row_id(allocated);
+  return allocated;
+}
+
+/** On IMPORT TABLESPACE, raise row_id to id unless it already exceeds id */
+inline void dict_sys_t::update_row_id(row_id_t id)
+{
+  for (row_id_t current= row_id; id >= current; )
+  {
+    if (row_id.compare_exchange_strong(current, id))
+    {
+      if (id % ROW_ID_WRITE_MARGIN == 0)
+        dict_hdr_flush_row_id(id);
+      return;
+    }
+  }
+}
+
+/**********************************************************************//**
+Writes a row id to a record or other 6-byte stored form. */
+inline void dict_sys_write_row_id(byte *field, row_id_t row_id)
+{
+  static_assert(DATA_ROW_ID_LEN == 6, "compatibility");
+  mach_write_to_6(field, row_id);
+}
+
+/*****************************************************************//**
+Initializes the data dictionary memory structures when the database is
+started. This function is also called when the data dictionary is created.
+@return DB_SUCCESS or error code. */
+dberr_t
+dict_boot(void)
+/*===========*/
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*****************************************************************//**
+Creates and initializes the data dictionary at the server bootstrap.
+@return DB_SUCCESS or error code. */
+dberr_t
+dict_create(void)
+/*=============*/
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Check if a table id belongs to a system table.
+@return true if the table id is smaller than DICT_HDR_FIRST_ID */
+inline bool dict_is_sys_table(table_id_t id) { return id < DICT_HDR_FIRST_ID; }
+
+/* Space id and page no where the dictionary header resides */
+#define	DICT_HDR_SPACE		0	/* the SYSTEM tablespace */
+#define	DICT_HDR_PAGE_NO	FSP_DICT_HDR_PAGE_NO
+
+/* The ids for the basic system tables and their indexes */
+#define DICT_TABLES_ID		1
+#define DICT_COLUMNS_ID		2
+#define DICT_INDEXES_ID		dict_index_t::DICT_INDEXES_ID /* 3 */
+#define DICT_FIELDS_ID		4
+/* The following is a secondary index on SYS_TABLES */
+#define DICT_TABLE_IDS_ID	5
+
+/* The offset of the dictionary header on the page */
+#define	DICT_HDR		FSEG_PAGE_DATA
+
+/*-------------------------------------------------------------*/
+/* Dictionary header offsets */
+#define DICT_HDR_ROW_ID		0	/* The latest assigned row id */
+#define DICT_HDR_TABLE_ID	8	/* The latest assigned table id */
+#define DICT_HDR_INDEX_ID	16	/* The latest assigned index id */
+#define DICT_HDR_MAX_SPACE_ID	24	/* The latest assigned space id,or 0*/
+#define DICT_HDR_MIX_ID_LOW	28	/* Obsolete,always DICT_HDR_FIRST_ID*/
+#define DICT_HDR_TABLES		32	/* Root of SYS_TABLES clust index */
+#define DICT_HDR_TABLE_IDS	36	/* Root of SYS_TABLE_IDS sec index */
+#define DICT_HDR_COLUMNS	40	/* Root of SYS_COLUMNS clust index */
+#define DICT_HDR_INDEXES	44	/* Root of SYS_INDEXES clust index */
+#define DICT_HDR_FIELDS		48	/* Root of SYS_FIELDS clust index */
+
+#define DICT_HDR_FSEG_HEADER	56	/* Segment header for the tablespace
+					segment into which the dictionary
+					header is created */
+/*-------------------------------------------------------------*/
+
+/* The columns in SYS_TABLES */
+enum dict_col_sys_tables_enum {
+	DICT_COL__SYS_TABLES__NAME		= 0,
+	DICT_COL__SYS_TABLES__ID		= 1,
+	DICT_COL__SYS_TABLES__N_COLS		= 2,
+	DICT_COL__SYS_TABLES__TYPE		= 3,
+	DICT_COL__SYS_TABLES__MIX_ID		= 4,
+	DICT_COL__SYS_TABLES__MIX_LEN		= 5,
+	DICT_COL__SYS_TABLES__CLUSTER_ID	= 6,
+	DICT_COL__SYS_TABLES__SPACE		= 7,
+	DICT_NUM_COLS__SYS_TABLES		= 8
+};
+/* The field numbers in the SYS_TABLES clustered index */
+enum dict_fld_sys_tables_enum {
+	DICT_FLD__SYS_TABLES__NAME		= 0,
+	DICT_FLD__SYS_TABLES__DB_TRX_ID		= 1,
+	DICT_FLD__SYS_TABLES__DB_ROLL_PTR	= 2,
+	DICT_FLD__SYS_TABLES__ID		= 3,
+	DICT_FLD__SYS_TABLES__N_COLS		= 4,
+	DICT_FLD__SYS_TABLES__TYPE		= 5,
+	DICT_FLD__SYS_TABLES__MIX_ID		= 6,
+	DICT_FLD__SYS_TABLES__MIX_LEN		= 7,
+	DICT_FLD__SYS_TABLES__CLUSTER_ID	= 8,
+	DICT_FLD__SYS_TABLES__SPACE		= 9,
+	DICT_NUM_FIELDS__SYS_TABLES		= 10
+};
+/* The field numbers in the SYS_TABLE_IDS index */
+enum dict_fld_sys_table_ids_enum {
+	DICT_FLD__SYS_TABLE_IDS__ID		= 0,
+	DICT_FLD__SYS_TABLE_IDS__NAME		= 1,
+	DICT_NUM_FIELDS__SYS_TABLE_IDS		= 2
+};
+/* The columns in SYS_COLUMNS */
+enum dict_col_sys_columns_enum {
+	DICT_COL__SYS_COLUMNS__TABLE_ID		= 0,
+	DICT_COL__SYS_COLUMNS__POS		= 1,
+	DICT_COL__SYS_COLUMNS__NAME		= 2,
+	DICT_COL__SYS_COLUMNS__MTYPE		= 3,
+	DICT_COL__SYS_COLUMNS__PRTYPE		= 4,
+	DICT_COL__SYS_COLUMNS__LEN		= 5,
+	DICT_COL__SYS_COLUMNS__PREC		= 6,
+	DICT_NUM_COLS__SYS_COLUMNS		= 7
+};
+/* The field numbers in the SYS_COLUMNS clustered index */
+enum dict_fld_sys_columns_enum {
+	DICT_FLD__SYS_COLUMNS__TABLE_ID		= 0,
+	DICT_FLD__SYS_COLUMNS__POS		= 1,
+	DICT_FLD__SYS_COLUMNS__DB_TRX_ID	= 2,
+	DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR	= 3,
+	DICT_FLD__SYS_COLUMNS__NAME		= 4,
+	DICT_FLD__SYS_COLUMNS__MTYPE		= 5,
+	DICT_FLD__SYS_COLUMNS__PRTYPE		= 6,
+	DICT_FLD__SYS_COLUMNS__LEN		= 7,
+	DICT_FLD__SYS_COLUMNS__PREC		= 8,
+	DICT_NUM_FIELDS__SYS_COLUMNS		= 9
+};
+/* The columns in SYS_INDEXES */
+enum dict_col_sys_indexes_enum {
+	DICT_COL__SYS_INDEXES__TABLE_ID		= 0,
+	DICT_COL__SYS_INDEXES__ID		= 1,
+	DICT_COL__SYS_INDEXES__NAME		= 2,
+	DICT_COL__SYS_INDEXES__N_FIELDS		= 3,
+	DICT_COL__SYS_INDEXES__TYPE		= 4,
+	DICT_COL__SYS_INDEXES__SPACE		= 5,
+	DICT_COL__SYS_INDEXES__PAGE_NO		= 6,
+	DICT_COL__SYS_INDEXES__MERGE_THRESHOLD	= 7,
+	DICT_NUM_COLS__SYS_INDEXES		= 8
+};
+/* The field numbers in the SYS_INDEXES clustered index */
+enum dict_fld_sys_indexes_enum {
+	DICT_FLD__SYS_INDEXES__TABLE_ID		= 0,
+	DICT_FLD__SYS_INDEXES__ID		= 1,
+	DICT_FLD__SYS_INDEXES__DB_TRX_ID	= 2,
+	DICT_FLD__SYS_INDEXES__DB_ROLL_PTR	= 3,
+	DICT_FLD__SYS_INDEXES__NAME		= 4,
+	DICT_FLD__SYS_INDEXES__N_FIELDS		= 5,
+	DICT_FLD__SYS_INDEXES__TYPE		= 6,
+	DICT_FLD__SYS_INDEXES__SPACE		= 7,
+	DICT_FLD__SYS_INDEXES__PAGE_NO		= 8,
+	DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD	= 9,
+	DICT_NUM_FIELDS__SYS_INDEXES		= 10
+};
+/* The columns in SYS_FIELDS */
+enum dict_col_sys_fields_enum {
+	DICT_COL__SYS_FIELDS__INDEX_ID		= 0,
+	DICT_COL__SYS_FIELDS__POS		= 1,
+	DICT_COL__SYS_FIELDS__COL_NAME		= 2,
+	DICT_NUM_COLS__SYS_FIELDS		= 3
+};
+/* The field numbers in the SYS_FIELDS clustered index */
+enum dict_fld_sys_fields_enum {
+	DICT_FLD__SYS_FIELDS__INDEX_ID		= 0,
+	DICT_FLD__SYS_FIELDS__POS		= 1,
+	DICT_FLD__SYS_FIELDS__DB_TRX_ID		= 2,
+	DICT_FLD__SYS_FIELDS__DB_ROLL_PTR	= 3,
+	DICT_FLD__SYS_FIELDS__COL_NAME		= 4,
+	DICT_NUM_FIELDS__SYS_FIELDS		= 5
+};
+/* The columns in SYS_FOREIGN */
+enum dict_col_sys_foreign_enum {
+	DICT_COL__SYS_FOREIGN__ID		= 0,
+	DICT_COL__SYS_FOREIGN__FOR_NAME		= 1,
+	DICT_COL__SYS_FOREIGN__REF_NAME		= 2,
+	DICT_COL__SYS_FOREIGN__N_COLS		= 3,
+	DICT_NUM_COLS__SYS_FOREIGN		= 4
+};
+/* The field numbers in the SYS_FOREIGN clustered index */
+enum dict_fld_sys_foreign_enum {
+	DICT_FLD__SYS_FOREIGN__ID		= 0,
+	DICT_FLD__SYS_FOREIGN__DB_TRX_ID	= 1,
+	DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR	= 2,
+	DICT_FLD__SYS_FOREIGN__FOR_NAME		= 3,
+	DICT_FLD__SYS_FOREIGN__REF_NAME		= 4,
+	DICT_FLD__SYS_FOREIGN__N_COLS		= 5,
+	DICT_NUM_FIELDS__SYS_FOREIGN		= 6
+};
+/* The field numbers in the SYS_FOREIGN_FOR_NAME secondary index */
+enum dict_fld_sys_foreign_for_name_enum {
+	DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME	= 0,
+	DICT_FLD__SYS_FOREIGN_FOR_NAME__ID	= 1,
+	DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME	= 2
+};
+/* The columns in SYS_FOREIGN_COLS */
+enum dict_col_sys_foreign_cols_enum {
+	DICT_COL__SYS_FOREIGN_COLS__ID			= 0,
+	DICT_COL__SYS_FOREIGN_COLS__POS			= 1,
+	DICT_COL__SYS_FOREIGN_COLS__FOR_COL_NAME	= 2,
+	DICT_COL__SYS_FOREIGN_COLS__REF_COL_NAME	= 3,
+	DICT_NUM_COLS__SYS_FOREIGN_COLS			= 4
+};
+/* The field numbers in the SYS_FOREIGN_COLS clustered index */
+enum dict_fld_sys_foreign_cols_enum {
+	DICT_FLD__SYS_FOREIGN_COLS__ID			= 0,
+	DICT_FLD__SYS_FOREIGN_COLS__POS			= 1,
+	DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID		= 2,
+	DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR		= 3,
+	DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME	= 4,
+	DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME	= 5,
+	DICT_NUM_FIELDS__SYS_FOREIGN_COLS		= 6
+};
+/* The columns in SYS_VIRTUAL */
+enum dict_col_sys_virtual_enum {
+	DICT_COL__SYS_VIRTUAL__TABLE_ID		= 0,
+	DICT_COL__SYS_VIRTUAL__POS		= 1,
+	DICT_COL__SYS_VIRTUAL__BASE_POS		= 2,
+	DICT_NUM_COLS__SYS_VIRTUAL		= 3
+};
+/* The field numbers in the SYS_VIRTUAL clustered index */
+enum dict_fld_sys_virtual_enum {
+	DICT_FLD__SYS_VIRTUAL__TABLE_ID		= 0,
+	DICT_FLD__SYS_VIRTUAL__POS		= 1,
+	DICT_FLD__SYS_VIRTUAL__BASE_POS		= 2,
+	DICT_FLD__SYS_VIRTUAL__DB_TRX_ID	= 3,
+	DICT_FLD__SYS_VIRTUAL__DB_ROLL_PTR	= 4,
+	DICT_NUM_FIELDS__SYS_VIRTUAL		= 5
+};
+
+/* A number of the columns above occur in multiple tables.  These are the
+lengths of those fields. */
+#define	DICT_FLD_LEN_SPACE	4
+#define	DICT_FLD_LEN_FLAGS	4
+
+#endif
diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h
new file mode 100644
index 00000000..c40df12b
--- /dev/null
+++ b/storage/innobase/include/dict0crea.h
@@ -0,0 +1,277 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0crea.h
+Database object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0crea_h
+#define dict0crea_h
+
+#include "dict0dict.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+#include "fil0crypt.h"
+
+/*********************************************************************//**
+Creates a table create graph.
+@return own: table create node */
+tab_node_t*
+tab_create_graph_create(
+/*====================*/
+	dict_table_t*	table,		/*!< in: table to create, built as
+					a memory data structure */
+	mem_heap_t*	heap);		/*!< in: heap where created */
+
+/** Creates an index create graph.
+@param[in]	index	index to create, built as a memory data structure
+@param[in]	table	table name
+@param[in,out]	heap	heap where created
+@param[in]	mode	encryption mode (for creating a table)
+@param[in]	key_id	encryption key identifier (for creating a table)
+@param[in]	add_v	new virtual columns added in the same clause with
+			add index
+@return own: index create node */
+ind_node_t*
+ind_create_graph_create(
+	dict_index_t*		index,
+	const char*		table,
+	mem_heap_t*		heap,
+	fil_encryption_t	mode,
+	uint32_t		key_id,
+	const dict_add_v_col_t*	add_v = NULL);
+
+/***********************************************************//**
+Creates a table. This is a high-level function used in SQL execution graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+dict_create_table_step(
+/*===================*/
+	que_thr_t*	thr);		/*!< in: query thread */
+
+/***********************************************************//**
+Creates an index. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+dict_create_index_step(
+/*===================*/
+	que_thr_t*	thr);		/*!< in: query thread */
+
+/***************************************************************//**
+Builds an index definition but doesn't update sys_table.
+No status is returned: the function is declared void. */
+void
+dict_build_index_def(
+/*=================*/
+	const dict_table_t*	table,	/*!< in: table */
+	dict_index_t*		index,	/*!< in/out: index */
+	trx_t*			trx);	/*!< in/out: InnoDB transaction
+					handle */
+/***************************************************************//**
+Creates an index tree for the index if it is not a member of a cluster.
+Don't update SYSTEM TABLES.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+dberr_t
+dict_create_index_tree(
+/*===================*/
+	dict_index_t*	index,	/*!< in/out: index */
+	const trx_t*	trx);	/*!< in: InnoDB transaction handle */
+
+/** Drop the index tree associated with a row in SYS_INDEXES table.
+@param[in,out]	pcur	persistent cursor on rec
+@param[in,out]	trx	dictionary transaction
+@param[in,out]	mtr	mini-transaction
+@return tablespace ID to drop (if this is the clustered index)
+@retval 0 if no tablespace is to be dropped */
+uint32_t dict_drop_index_tree(btr_pcur_t *pcur, trx_t *trx, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
+
+/***************************************************************//**
+Creates an index tree for the index if it is not a member of a cluster.
+Don't update SYSTEM TABLES.
+@return	error code */
+dberr_t
+dict_create_index_tree_in_mem(
+/*==========================*/
+	dict_index_t*	index,		/*!< in/out: index */
+	const trx_t*	trx);		/*!< in: InnoDB transaction handle */
+
+/********************************************************************//**
+Generate a foreign key constraint name when it was not named by the user.
+A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER,
+where the numbers start from 1, and are given locally for this table, that is,
+the number is not global, as it used to be before MySQL 4.0.18.  */
+UNIV_INLINE
+dberr_t
+dict_create_add_foreign_id(
+/*=======================*/
+	ulint*		id_nr,		/*!< in/out: number to use in id
+					generation; incremented if used */
+	const char*	name,		/*!< in: table name */
+	dict_foreign_t*	foreign);	/*!< in/out: foreign key */
+
+/** Adds the given set of foreign key objects to the dictionary tables
+in the database. This function does not modify the dictionary cache. The
+caller must ensure that all foreign key objects contain a valid constraint
+name in foreign->id.
+@param[in]	local_fk_set	set of foreign key objects, to be added to
+the dictionary tables
+@param[in]	table		table to which the foreign key objects in
+local_fk_set belong to
+@param[in,out]	trx		transaction
+@return error code or DB_SUCCESS */
+dberr_t
+dict_create_add_foreigns_to_dictionary(
+/*===================================*/
+	const dict_foreign_set&	local_fk_set,
+	const dict_table_t*	table,
+	trx_t*			trx)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Check if a foreign constraint is on columns that serve as base columns
+of any stored column. This is to prevent creating SET NULL or CASCADE
+constraints on such columns
+@param[in]	local_fk_set	set of foreign key objects, to be added to
+the dictionary tables
+@param[in]	table		table to which the foreign key objects in
+local_fk_set belong to
+@return true if yes, otherwise, false */
+bool
+dict_foreigns_has_s_base_col(
+	const dict_foreign_set&	local_fk_set,
+	const dict_table_t*	table);
+
+/********************************************************************//**
+Add a foreign key definition to the data dictionary tables.
+@return error code or DB_SUCCESS */
+dberr_t
+dict_create_add_foreign_to_dictionary(
+/*==================================*/
+	const char*		name,	/*!< in: table name */
+	const dict_foreign_t*	foreign,/*!< in: foreign key */
+	trx_t*			trx)	/*!< in/out: dictionary transaction */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/* Table create node structure */
+struct tab_node_t{
+	que_common_t	common;		/*!< node type: QUE_NODE_TABLE_CREATE */
+	dict_table_t*	table;		/*!< table to create, built as a
+					memory data structure with
+					dict_mem_... functions */
+	ins_node_t*	tab_def;	/*!< child node which does the insert of
+					the table definition; the row to be
+					inserted is built by the parent node  */
+	ins_node_t*	col_def;	/*!< child node which does the inserts
+					of the column definitions; the row to
+					be inserted is built by the parent
+					node  */
+	ins_node_t*	v_col_def;	/*!< child node which does the inserts
+					of the sys_virtual row definitions;
+					the row to be inserted is built by
+					the parent node  */
+	/*----------------------*/
+	/* Local storage for this graph node */
+	ulint		state;		/*!< node execution state */
+	ulint		col_no;		/*!< next column definition to insert */
+	ulint		base_col_no;	/*!< next base column to insert */
+	mem_heap_t*	heap;		/*!< memory heap used as auxiliary
+					storage */
+};
+
+/* Table create node states */
+#define	TABLE_BUILD_TABLE_DEF	1
+#define	TABLE_BUILD_COL_DEF	2
+#define	TABLE_BUILD_V_COL_DEF	3
+#define	TABLE_ADD_TO_CACHE	4
+#define	TABLE_COMPLETED		5
+
+/* Index create node struct */
+
+struct ind_node_t{
+	que_common_t	common;		/*!< node type: QUE_NODE_INDEX_CREATE */
+	dict_index_t*	index;		/*!< index to create, built as a
+					memory data structure with
+					dict_mem_... functions */
+	const char*	table_name;	/*!< table name */
+	ins_node_t*	ind_def;	/*!< child node which does the insert of
+					the index definition; the row to be
+					inserted is built by the parent node  */
+	ins_node_t*	field_def;	/*!< child node which does the inserts
+					of the field definitions; the row to
+					be inserted is built by the parent
+					node  */
+	/*----------------------*/
+	/* Local storage for this graph node */
+	ulint		state;		/*!< node execution state */
+	uint32_t	page_no;	/*!< root page number of the index */
+	dtuple_t*	ind_row;	/*!< index definition row built */
+	ulint		field_no;	/*!< next field definition to insert */
+	mem_heap_t*	heap;		/*!< memory heap used as auxiliary
+					storage */
+	uint		key_id;		/*!< encryption key_id */
+	fil_encryption_t mode;		/*!< encryption mode */
+	const dict_add_v_col_t*
+			add_v;		/*!< new virtual columns that are being
+					added along with an add index call */
+};
+
+/** Compose a column number for a virtual column, stored in the "POS" field
+of Sys_columns. The column number includes both its virtual column sequence
+(the "nth" virtual column) and its actual column position in original table
+@param[in]	v_pos		virtual column sequence
+@param[in]	col_pos		column position in original table definition
+@return	composed column position number */
+UNIV_INLINE
+ulint
+dict_create_v_col_pos(
+	ulint	v_pos,
+	ulint	col_pos);
+
+/** Get the column number for a virtual column (the column position in
+original table), stored in the "POS" field of Sys_columns
+@param[in]      pos             virtual column position
+@return column position in original table */
+UNIV_INLINE
+ulint
+dict_get_v_col_mysql_pos(
+        ulint   pos);
+
+/** Get a virtual column sequence (the "nth" virtual column) for a
+virtual column, stored in the "POS" field of Sys_columns
+@param[in]      pos             virtual column position
+@return virtual column sequence */
+UNIV_INLINE
+ulint
+dict_get_v_col_pos(
+        ulint   pos);
+
+/* Index create node states */
+#define	INDEX_BUILD_INDEX_DEF	1
+#define	INDEX_BUILD_FIELD_DEF	2
+#define	INDEX_CREATE_INDEX_TREE	3
+#define	INDEX_ADD_TO_CACHE	4
+
+#include "dict0crea.inl"
+
+#endif
diff --git a/storage/innobase/include/dict0crea.inl b/storage/innobase/include/dict0crea.inl
new file mode 100644
index 00000000..5641206d
--- /dev/null
+++ b/storage/innobase/include/dict0crea.inl
@@ -0,0 +1,136 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0crea.inl
+Database object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#include "ha_prototypes.h"
+
+#include "mem0mem.h"
+
+/********************************************************************//**
+Generate a foreign key constraint name when it was not named by the user.
+A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER,
+where the numbers start from 1 and are local to this table (they are not
+global, as they used to be before MySQL 4.0.18).
+@return DB_SUCCESS, or DB_IDENTIFIER_TOO_LONG if the generated id is too long */
+UNIV_INLINE
+dberr_t
+dict_create_add_foreign_id(
+/*=======================*/
+	ulint*		id_nr,	/*!< in/out: number to use in id generation;
+				incremented if used */
+	const char*	name,	/*!< in: table name */
+	dict_foreign_t*	foreign)/*!< in/out: foreign key */
+{
+	DBUG_ENTER("dict_create_add_foreign_id");
+
+	if (foreign->id == NULL) {
+		/* Room for name, "_ibfk_", a number < 1e13 and NUL */
+		const ulint	id_size	= strlen(name) + 20;
+		char*	id	= static_cast<char*>(
+					mem_heap_alloc(foreign->heap,
+						       id_size));
+
+		if (dict_table_t::is_temporary_name(name)) {
+			/* Bounded write: a number >= 1e13 is truncated
+			instead of overrunning the heap buffer */
+			snprintf(id, id_size, "%s_ibfk_%lu", name,
+				 (ulong) (*id_nr)++);
+		} else {
+			char	table_name[MAX_TABLE_NAME_LEN + 21];
+			uint	errors = 0;
+
+			strncpy(table_name, name, (sizeof table_name) - 1);
+			table_name[(sizeof table_name) - 1] = '\0';
+
+			innobase_convert_to_system_charset(
+				strchr(table_name, '/') + 1,
+				strchr(name, '/') + 1,
+				MAX_TABLE_NAME_LEN, &errors);
+
+			if (errors) {
+				/* Conversion reported errors: use name as is */
+				strncpy(table_name, name,
+					(sizeof table_name) - 1);
+				table_name[(sizeof table_name) - 1] = '\0';
+			}
+
+			/* Bounded by the size of the id allocation */
+			snprintf(id, id_size, "%s_ibfk_%lu", table_name,
+				 (ulong) (*id_nr)++);
+
+			if (innobase_check_identifier_length(
+				strchr(id,'/') + 1)) {
+				DBUG_RETURN(DB_IDENTIFIER_TOO_LONG);
+			}
+		}
+		foreign->id = id;
+		DBUG_PRINT("dict_create_add_foreign_id",
+			   ("generated foreign id: %s", id));
+	}
+
+	DBUG_RETURN(DB_SUCCESS);
+}
+
+/** Build the value stored in the "POS" field of Sys_columns for a virtual
+column. The result holds the virtual column sequence (the "nth" virtual
+column) plus one, shifted left by 16 bits, with the column position added.
+@param[in]	v_pos		virtual column sequence
+@param[in]	col_pos		column position in original table definition
+@return composed column position number */
+UNIV_INLINE
+ulint
+dict_create_v_col_pos(
+	ulint	v_pos,
+	ulint	col_pos)
+{
+	ut_ad(v_pos <= REC_MAX_N_FIELDS);
+	ut_ad(col_pos <= REC_MAX_N_FIELDS);
+	const ulint	v_seq_bits = (v_pos + 1) << 16;
+	return(v_seq_bits + col_pos);
+}
+
+/** Extract the column position in the original table from the composed
+virtual column position kept in the "POS" field of Sys_columns
+@param[in]	pos		virtual column position
+@return the low 16 bits of pos: the column position in original table */
+UNIV_INLINE
+ulint
+dict_get_v_col_mysql_pos(ulint pos)
+{
+	const ulint	col_pos_mask = 0xFFFF;
+	return(pos & col_pos_mask);
+}
+
+/** Extract the virtual column sequence (the "nth" virtual column) from the
+composed virtual column position stored in the "POS" field of Sys_columns
+@param[in]	pos		virtual column position
+@return virtual column sequence: the bits above bit 15 of pos, minus one */
+UNIV_INLINE
+ulint
+dict_get_v_col_pos(ulint pos)
+{
+	const ulint	v_seq_plus_one = pos >> 16;
+	return(v_seq_plus_one - 1);
+}
diff --git a/storage/innobase/include/dict0defrag_bg.h b/storage/innobase/include/dict0defrag_bg.h
new file mode 100644
index 00000000..679484ad
--- /dev/null
+++ b/storage/innobase/include/dict0defrag_bg.h
@@ -0,0 +1,101 @@
+/*****************************************************************************
+
+Copyright (c) 2016, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0defrag_bg.h
+Code used for background table and index
+defragmentation
+
+Created 25/08/2016 Jan Lindström
+*******************************************************/
+
+#ifndef dict0defrag_bg_h
+#define dict0defrag_bg_h
+
+#include "dict0types.h"
+
+/** An index whose defrag stats need to be saved to persistent storage. */
+struct defrag_pool_item_t {
+	table_id_t	table_id;	/*!< table identifier */
+	index_id_t	index_id;	/*!< index identifier */
+};
+
+/** Allocator type, used by std::vector */
+typedef ut_allocator<defrag_pool_item_t>
+	defrag_pool_allocator_t;
+
+/** The (table id, index id) entries to be processed - an STL vector */
+typedef std::vector<defrag_pool_item_t, defrag_pool_allocator_t>
+	defrag_pool_t;
+
+/** Pool where we store information on which tables are to be processed
+by background defragmentation. */
+extern defrag_pool_t		defrag_pool;
+
+/*****************************************************************//**
+Initialize the defrag pool, called once during thread initialization. */
+void
+dict_defrag_pool_init(void);
+/*========================*/
+
+/*****************************************************************//**
+Free the resources occupied by the defrag pool, called once during
+thread de-initialization. */
+void
+dict_defrag_pool_deinit(void);
+/*==========================*/
+
+/*****************************************************************//**
+Add an index in a table to the defrag pool, which is processed by the
+background stats gathering thread. Only the table id and index id are
+added to the list, so the table can be closed after being enqueued and
+it will be opened when needed. If the table or index does not exist later
+(has been DROPped), then it will be removed from the pool and skipped. */
+void
+dict_stats_defrag_pool_add(
+/*=======================*/
+	const dict_index_t*	index);	/*!< in: index to add */
+
+/*****************************************************************//**
+Delete a given index from the auto defrag pool. */
+void
+dict_stats_defrag_pool_del(
+/*=======================*/
+	const dict_table_t*	table,	/*!< in: if given, remove
+					all entries for the table */
+	const dict_index_t*	index);	/*!< in: index to remove */
+
+/**
+Get the first index that has been added for updating persistent defrag
+stats and eventually save its stats. */
+void dict_defrag_process_entries_from_defrag_pool(THD *thd);
+
+/*********************************************************************//**
+Save defragmentation result.
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_save_defrag_summary(dict_index_t *index, THD *thd)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Save defragmentation stats for a given index.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_save_defrag_stats(
+/*============================*/
+	dict_index_t*	index);	/*!< in: index */
+#endif /* dict0defrag_bg_h */
diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h
new file mode 100644
index 00000000..5fafb2c5
--- /dev/null
+++ b/storage/innobase/include/dict0dict.h
@@ -0,0 +1,1744 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0dict.h
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0dict_h
+#define dict0dict_h
+
+#include "data0data.h"
+#include "dict0mem.h"
+#include "fsp0fsp.h"
+#include "srw_lock.h"
+#include <my_sys.h>
+#include <deque>
+
+class MDL_ticket;
+
+/** the first table or index ID for other than hard-coded system tables */
+constexpr uint8_t DICT_HDR_FIRST_ID= 10;
+
+
+/** Get the database name length in a table name.
+@param name   filename-safe encoded table name "dbname/tablename"
+@return database name length (offset of the first '/'), or 0 if no '/' */
+inline size_t dict_get_db_name_len(const char *name)
+{
+  /* table_name_t::dblen() would assert that '/' is contained */
+  if (const char* s= strchr(name, '/'))
+    return size_t(s - name);
+
+  return 0;
+}
+
+
+/*********************************************************************//**
+Open a table from its database and table name, this is currently used by
+foreign constraint parser to get the referenced table.
+@return complete table name with database and table name, allocated from
+heap memory passed in */
+char*
+dict_get_referenced_table(
+/*======================*/
+	const char*	name,		/*!< in: foreign key table name */
+	const char*	database_name,	/*!< in: table db name */
+	ulint		database_name_len,/*!< in: db name length */
+	const char*	table_name,	/*!< in: table name */
+	ulint		table_name_len,	/*!< in: table name length */
+	dict_table_t**	table,		/*!< out: table object or NULL */
+	mem_heap_t*	heap,		/*!< in: heap memory */
+	CHARSET_INFO*	from_cs);	/*!< in: table name charset */
+/*********************************************************************//**
+Frees a foreign key struct. */
+void
+dict_foreign_free(
+/*==============*/
+	dict_foreign_t*	foreign);	/*!< in, own: foreign key struct */
+/*********************************************************************//**
+Finds the highest [number] for foreign key constraints of the table. Looks
+only at the >= 4.0.18-format id's, which are of the form
+databasename/tablename_ibfk_[number].
+@return highest number, 0 if table has no new format foreign key constraints */
+ulint
+dict_table_get_highest_foreign_id(
+/*==============================*/
+	dict_table_t*	table);		/*!< in: table in the dictionary
+					memory cache */
+/** Check whether the dict_table_t is a partition.
+A partitioned table on the SQL level is composed of InnoDB tables,
+where each InnoDB table is a [sub]partition including the secondary indexes
+which belong to the partition.
+@param[in]	table	Table to check.
+@return true if the table name contains "#p#" or "#P#", else false. */
+UNIV_INLINE
+bool
+dict_table_is_partition(const dict_table_t* table)
+{
+	/* Check both P and p on all platforms in case it was moved to/from
+	WIN. */
+	return (strstr(table->name.m_name, "#p#")
+		|| strstr(table->name.m_name, "#P#"));
+}
+/********************************************************************//**
+Return the end of table name where we have removed dbname and '/'.
+@return table name */
+const char*
+dict_remove_db_name(
+/*================*/
+	const char*	name)	/*!< in: table name in the form
+				dbname '/' tablename */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Operation to perform when opening a table (the table_op argument) */
+enum dict_table_op_t {
+	/** Expect the tablespace to exist. */
+	DICT_TABLE_OP_NORMAL = 0,
+	/** Drop any orphan indexes after an aborted online index creation */
+	DICT_TABLE_OP_DROP_ORPHAN,
+	/** Silently load the tablespace if it does not exist,
+	and do not load the definitions of incomplete indexes. */
+	DICT_TABLE_OP_LOAD_TABLESPACE,
+	/** Open the table only if it is already in the table cache. */
+	DICT_TABLE_OP_OPEN_ONLY_IF_CACHED
+};
+
+/** Acquire MDL shared for the table name.
+@tparam trylock whether to use non-blocking operation
+@param[in,out]  table           table object
+@param[in,out]  thd             background thread
+@param[out]     mdl             mdl ticket
+@param[in]      table_op        operation to perform when opening
+@return table object after locking MDL shared
+@retval NULL if the table is not readable, or if trylock && MDL blocked */
+template<bool trylock>
+dict_table_t*
+dict_acquire_mdl_shared(dict_table_t *table,
+                        THD *thd,
+                        MDL_ticket **mdl,
+                        dict_table_op_t table_op= DICT_TABLE_OP_NORMAL);
+
+/** Look up a table by numeric identifier.
+@param[in]      table_id        table identifier
+@param[in]      dict_locked     data dictionary locked
+@param[in]      table_op        operation to perform when opening
+@param[in,out]  thd             background thread, or NULL to not acquire MDL
+@param[out]     mdl             mdl ticket, or NULL
+@return table, NULL if does not exist */
+dict_table_t*
+dict_table_open_on_id(table_id_t table_id, bool dict_locked,
+                      dict_table_op_t table_op, THD *thd= nullptr,
+                      MDL_ticket **mdl= nullptr)
+  MY_ATTRIBUTE((warn_unused_result));
+
+/** Decrement the count of open handles */
+void dict_table_close(dict_table_t *table);
+
+/** Decrements the count of open handles of a table.
+@param[in,out]	table		table
+@param[in]	dict_locked	whether dict_sys.latch is being held
+@param[in]	thd		thread to release MDL
+@param[in]	mdl		metadata lock or NULL if the thread is a
+				foreground one. */
+void
+dict_table_close(
+	dict_table_t*	table,
+	bool		dict_locked,
+	THD*		thd = NULL,
+	MDL_ticket*	mdl = NULL);
+
+/*********************************************************************//**
+Gets the minimum number of bytes per character.
+@return minimum multi-byte char size, in bytes */
+UNIV_INLINE
+unsigned
+dict_col_get_mbminlen(
+/*==================*/
+	const dict_col_t*	col)	/*!< in: column */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the maximum number of bytes per character.
+@return maximum multi-byte char size, in bytes */
+UNIV_INLINE
+unsigned
+dict_col_get_mbmaxlen(
+/*==================*/
+	const dict_col_t*	col)	/*!< in: column */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the column data type. */
+UNIV_INLINE
+void
+dict_col_copy_type(
+/*===============*/
+	const dict_col_t*	col,	/*!< in: column */
+	dtype_t*		type);	/*!< out: data type */
+
+/**********************************************************************//**
+Determine bytes of column prefix to be stored in the undo log. Please
+note that if !dict_table_has_atomic_blobs(table), no prefix
+needs to be stored in the undo log.
+@return bytes of column prefix to be stored in the undo log */
+UNIV_INLINE
+ulint
+dict_max_field_len_store_undo(
+/*==========================*/
+	dict_table_t*		table,	/*!< in: table */
+	const dict_col_t*	col)	/*!< in: column which index prefix
+					is based on */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Determine the maximum number of bytes of a virtual column that need to
+be stored in the undo log.
+@param[in]	table		dict_table_t for the table
+@param[in]	col_no		virtual column number
+@return maximum bytes of virtual column to be stored in the undo log */
+UNIV_INLINE
+ulint
+dict_max_v_field_len_store_undo(
+	dict_table_t*		table,
+	ulint			col_no);
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Assert that a column and a data type match.
+@return TRUE */
+UNIV_INLINE
+ibool
+dict_col_type_assert_equal(
+/*=======================*/
+	const dict_col_t*	col,	/*!< in: column */
+	const dtype_t*		type)	/*!< in: data type */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+
+/***********************************************************************//**
+Returns the minimum size of the column.
+@return minimum size */
+UNIV_INLINE
+unsigned
+dict_col_get_min_size(
+/*==================*/
+	const dict_col_t*	col)	/*!< in: column */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************************//**
+Returns the maximum size of the column.
+@return maximum size */
+UNIV_INLINE
+ulint
+dict_col_get_max_size(
+/*==================*/
+	const dict_col_t*	col)	/*!< in: column */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************************//**
+Returns the size of a fixed size column, 0 if not a fixed size column.
+@return fixed size, or 0 */
+UNIV_INLINE
+unsigned
+dict_col_get_fixed_size(
+/*====================*/
+	const dict_col_t*	col,	/*!< in: column */
+	ulint			comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+unsigned
+dict_col_get_sql_null_size(
+/*=======================*/
+	const dict_col_t*	col,	/*!< in: column */
+	ulint			comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the column number.
+@return col->ind, table column position (starting from 0) */
+UNIV_INLINE
+unsigned
+dict_col_get_no(
+/*============*/
+	const dict_col_t*	col)	/*!< in: column */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the column position in the clustered index. */
+UNIV_INLINE
+ulint
+dict_col_get_clust_pos(
+/*===================*/
+	const dict_col_t*	col,		/*!< in: table column */
+	const dict_index_t*	clust_index)	/*!< in: clustered index */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Gets the column position in the given index.
+@param[in]	col	table column
+@param[in]	index	index to be searched for column
+@return position of column in the given index. */
+UNIV_INLINE
+ulint
+dict_col_get_index_pos(
+	const dict_col_t*	col,
+	const dict_index_t*	index)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/****************************************************************//**
+If the given column name is reserved for InnoDB system columns, return
+TRUE.
+@return TRUE if name is reserved */
+ibool
+dict_col_name_is_reserved(
+/*======================*/
+	const char*	name)	/*!< in: column name */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Unconditionally set the AUTO_INCREMENT counter (table->autoinc).
+@param[in,out]	table	table or partition whose counter is overwritten
+@param[in]	value	next available AUTO_INCREMENT value */
+MY_ATTRIBUTE((nonnull))
+UNIV_INLINE
+void
+dict_table_autoinc_initialize(dict_table_t* table, ib_uint64_t value)
+{
+	table->autoinc = value;
+}
+
+/** Read the AUTO_INCREMENT counter (table->autoinc) without changing it.
+@param[in]	table	table or partition
+@return the next AUTO_INCREMENT counter value
+@retval	0	if AUTO_INCREMENT is not yet initialized */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+UNIV_INLINE
+ib_uint64_t
+dict_table_autoinc_read(const dict_table_t* table)
+{
+	return(table->autoinc);
+}
+
+/** Update the AUTO_INCREMENT sequence if the value supplied is strictly
+greater than the current value (table->autoinc).
+@param[in,out]	table	table or partition
+@param[in]	value	AUTO_INCREMENT value that was assigned to a row
+@return	whether the sequence was updated (false if value <= current) */
+MY_ATTRIBUTE((nonnull))
+UNIV_INLINE
+bool
+dict_table_autoinc_update_if_greater(dict_table_t* table, ib_uint64_t value)
+{
+	if (value > table->autoinc) {
+
+		table->autoinc = value;
+		return(true);
+	}
+
+	return(false);
+}
+
+/**********************************************************************//**
+Adds system columns to a table object. */
+void
+dict_table_add_system_columns(
+/*==========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	mem_heap_t*	heap)	/*!< in: temporary heap */
+	MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Renames a table object.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_table_rename_in_cache(
+/*=======================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	span<const char> new_name,	/*!< in: new name */
+	bool		replace_new_file)
+					/*!< in: whether to replace the
+					file with the new name
+					(as part of rolling back TRUNCATE) */
+	MY_ATTRIBUTE((nonnull));
+
+/** Removes an index from the dictionary cache.
+@param[in,out]	table	table whose index to remove
+@param[in,out]	index	index to remove, this object is destroyed and must not
+be accessed by the caller afterwards */
+void
+dict_index_remove_from_cache(
+	dict_table_t*	table,
+	dict_index_t*	index);
+
+/**********************************************************************//**
+Change the id of a table object in the dictionary cache. This is used in
+DISCARD TABLESPACE. */
+void
+dict_table_change_id_in_cache(
+/*==========================*/
+	dict_table_t*	table,	/*!< in/out: table object already in cache */
+	table_id_t	new_id)	/*!< in: new id to set */
+	MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Removes a foreign constraint struct from the dictionary cache. */
+void
+dict_foreign_remove_from_cache(
+/*===========================*/
+	dict_foreign_t*	foreign)	/*!< in, own: foreign constraint */
+	MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Adds a foreign key constraint object to the dictionary cache. May free
+the object if the cache already holds an object with the same identifier.
+At least one of foreign table or referenced table must already be in
+the dictionary cache!
+@return DB_SUCCESS or error code */
+dberr_t
+dict_foreign_add_to_cache(
+/*======================*/
+	dict_foreign_t*		foreign,
+				/*!< in, own: foreign key constraint */
+	const char**		col_names,
+				/*!< in: column names, or NULL to use
+				foreign->foreign_table->col_names */
+	bool			check_charsets,
+				/*!< in: whether to check charset
+				compatibility */
+	dict_err_ignore_t	ignore_err)
+				/*!< in: error to be ignored */
+	MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+/**********************************************************************//**
+Replace the index passed in with another equivalent index in the
+foreign key lists of the table.
+@return whether all replacements were found */
+bool
+dict_foreign_replace_index(
+/*=======================*/
+	dict_table_t*		table,  /*!< in/out: table */
+	const char**		col_names,
+					/*!< in: column names, or NULL
+					to use table->col_names */
+	const dict_index_t*	index)	/*!< in: index to be replaced */
+	MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
+/**********************************************************************//**
+Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement.
+@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the
+constraint id does not match */
+dberr_t
+dict_foreign_parse_drop_constraints(
+/*================================*/
+	mem_heap_t*	heap,			/*!< in: heap from which we can
+						allocate memory */
+	trx_t*		trx,			/*!< in: transaction */
+	dict_table_t*	table,			/*!< in: table */
+	ulint*		n,			/*!< out: number of constraints
+						to drop */
+	const char***	constraints_to_drop)	/*!< out: id's of the
+						constraints to drop */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/**********************************************************************//**
+Returns a table object and increments its open handle count.
+NOTE! This is a high-level function to be used mainly from outside the
+'dict' directory. Inside this directory dict_table_get_low
+is usually the appropriate function.
+@param[in] table_name Table name
+@param[in] dict_locked whether dict_sys.latch is being held exclusively
+@param[in] ignore_err error to be ignored when loading the table
+@return table
+@retval nullptr if does not exist */
+dict_table_t*
+dict_table_open_on_name(
+	const char*		table_name,
+	bool			dict_locked,
+	dict_err_ignore_t	ignore_err)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Outcome of dict_foreign_find_index() or dict_foreign_qualify_index() */
+enum fkerr_t
+{
+  /** A backing index was found for a FOREIGN KEY constraint */
+  FK_SUCCESS = 0,
+  /** There is no index that covers the columns in the constraint. */
+  FK_INDEX_NOT_FOUND,
+  /** The index is a prefix index, not an index on a full column. */
+  FK_IS_PREFIX_INDEX,
+  /** A condition of SET NULL conflicts with a NOT NULL column. */
+  FK_COL_NOT_NULL,
+  /** The column types do not match */
+  FK_COLS_NOT_EQUAL
+};
+
+/*********************************************************************//**
+Tries to find an index whose first fields are the columns in the array,
+in the same order and is not marked for deletion and is not the same
+as types_idx.
+@return matching index, NULL if not found */
+dict_index_t*
+dict_foreign_find_index(
+/*====================*/
+	const dict_table_t*	table,	/*!< in: table */
+	const char**		col_names,
+					/*!< in: column names, or NULL
+					to use table->col_names */
+	const char**		columns,/*!< in: array of column names */
+	ulint			n_cols,	/*!< in: number of columns */
+	const dict_index_t*	types_idx,
+					/*!< in: NULL or an index
+					whose types the column types
+					must match */
+	bool			check_charsets,
+					/*!< in: whether to check
+					charsets.  only has an effect
+					if types_idx != NULL */
+	ulint			check_null,
+					/*!< in: nonzero if none of
+					the columns must be declared
+					NOT NULL */
+	fkerr_t*		error = NULL,	/*!< out: error code */
+	ulint*			err_col_no = NULL,
+					/*!< out: column number where
+					error happened */
+	dict_index_t**		err_index = NULL)
+					/*!< out: index where error
+					happened */
+
+	MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
+
+/** Returns a virtual column's name.
+@param[in]	table		table object
+@param[in]	col_nr		virtual column number(nth virtual column)
+@return column name. */
+const char*
+dict_table_get_v_col_name(
+	const dict_table_t*	table,
+	ulint			col_nr);
+
+/** Check if the table has a given column.
+@param[in]	table		table object
+@param[in]	col_name	column name
+@param[in]	col_nr		column number guessed, 0 as default
+@return column number if the table has the specified column,
+otherwise table->n_def */
+ulint
+dict_table_has_column(
+	const dict_table_t*	table,
+	const char*		col_name,
+	ulint			col_nr = 0);
+
+/**********************************************************************//**
+Outputs info on foreign keys of a table. */
+std::string
+dict_print_info_on_foreign_keys(
+/*============================*/
+	ibool		create_table_format, /*!< in: if TRUE then print in
+				a format suitable to be inserted into
+				a CREATE TABLE, otherwise in the format
+				of SHOW TABLE STATUS */
+	trx_t*		trx,	/*!< in: transaction */
+	dict_table_t*	table);	/*!< in: table */
+
+/**********************************************************************//**
+Outputs info on a foreign key of a table in a format suitable for
+CREATE TABLE. */
+std::string
+dict_print_info_on_foreign_key_in_create_format(
+/*============================================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_foreign_t*	foreign,	/*!< in: foreign key constraint */
+	ibool		add_newline);	/*!< in: whether to add a newline */
+
+/*********************************************************************//**
+Checks whether the given index has the columns in the array as its first
+fields, in the same order, is not marked for deletion and is not the same
+as types_idx.
+@return whether the index qualifies */
+bool
+dict_foreign_qualify_index(
+/*====================*/
+	const dict_table_t*	table,	/*!< in: table */
+	const char**		col_names,
+					/*!< in: column names, or NULL
+					to use table->col_names */
+	const char**		columns,/*!< in: array of column names */
+	ulint			n_cols,	/*!< in: number of columns */
+	const dict_index_t*	index,	/*!< in: index to check */
+	const dict_index_t*	types_idx,
+					/*!< in: NULL or an index
+					whose types the column types
+					must match */
+	bool			check_charsets,
+					/*!< in: whether to check
+					charsets.  only has an effect
+					if types_idx != NULL */
+	ulint			check_null,
+					/*!< in: nonzero if none of
+					the columns must be declared
+					NOT NULL */
+	fkerr_t*		error,	/*!< out: error code */
+	ulint*			err_col_no,
+					/*!< out: column number where
+					error happened */
+	dict_index_t**		err_index)
+					/*!< out: index where error
+					happened */
+	MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the first index on the table (the clustered index).
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+	const dict_table_t*	table)	/*!< in: table */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the last index on the table.
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_last_index(
+/*=======================*/
+	const dict_table_t*	table)	/*!< in: table */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the next index on the table.
+@return index, NULL if none left */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+/*======================*/
+	const dict_index_t*	index)	/*!< in: index */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+#else /* UNIV_DEBUG */
+# define dict_table_get_first_index(table) UT_LIST_GET_FIRST((table)->indexes)
+# define dict_table_get_last_index(table) UT_LIST_GET_LAST((table)->indexes)
+# define dict_table_get_next_index(index) UT_LIST_GET_NEXT(indexes, index)
+#endif /* UNIV_DEBUG */
+
+#define dict_index_is_clust(index) (index)->is_clust()
+#define dict_index_is_auto_gen_clust(index) (index)->is_gen_clust()
+#define dict_index_is_unique(index) (index)->is_unique()
+#define dict_index_is_spatial(index) (index)->is_spatial()
+#define dict_index_is_ibuf(index) (index)->is_ibuf()
+#define dict_index_is_sec_or_ibuf(index) !(index)->is_primary()
+#define dict_index_has_virtual(index) (index)->has_virtual()
+
+/** Get all the FTS indexes on a table.
+@param[in]	table	table
+@param[out]	indexes	all FTS indexes on this table
+@return number of FTS indexes */
+ulint
+dict_table_get_all_fts_indexes(
+	const dict_table_t*	table,
+	ib_vector_t*		indexes);
+
+/********************************************************************//**
+Gets the number of user-defined non-virtual columns in a table in the
+dictionary cache.
+@return number of user-defined (e.g., not ROW_ID) non-virtual
+columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_user_cols(
+/*=======================*/
+	const dict_table_t*	table)	/*!< in: table */
+	MY_ATTRIBUTE((warn_unused_result));
+/********************************************************************//**
+Gets the number of all non-virtual columns (also system) in a table
+in the dictionary cache.
+@return number of columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_cols(
+/*==================*/
+	const dict_table_t*	table)	/*!< in: table */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Gets the number of virtual columns in a table in the dictionary cache.
+@param[in]	table	the table to check
+@return number of virtual columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_v_cols(
+	const dict_table_t*	table);
+
+/** Check if a table has indexed virtual columns
+@param[in]	table	the table to check
+@return true if the table has indexed virtual columns */
+UNIV_INLINE
+bool
+dict_table_has_indexed_v_cols(
+	const dict_table_t*	table);
+
+/********************************************************************//**
+Gets the approximately estimated number of rows in the table.
+@return estimated number of rows */
+UNIV_INLINE
+ib_uint64_t
+dict_table_get_n_rows(
+/*==================*/
+	const dict_table_t*	table)	/*!< in: table */
+	MY_ATTRIBUTE((warn_unused_result));
+/********************************************************************//**
+Increment the number of rows in the table by one.
+Notice that this operation is not protected by any latch, the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_inc(
+/*==================*/
+	dict_table_t*	table)	/*!< in/out: table */
+	MY_ATTRIBUTE((nonnull));
+/********************************************************************//**
+Decrement the number of rows in the table by one.
+Notice that this operation is not protected by any latch, the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_dec(
+/*==================*/
+	dict_table_t*	table)	/*!< in/out: table */
+	MY_ATTRIBUTE((nonnull));
+
+/** Get nth virtual column
+@param[in]	table	target table
+@param[in]	col_nr	column number in MySQL Table definition
+@return dict_v_col_t ptr */
+dict_v_col_t*
+dict_table_get_nth_v_col_mysql(
+	const dict_table_t*	table,
+	ulint			col_nr);
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			pos)	/*!< in: position of column */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Gets the nth virtual column of a table.
+@param[in]	table	table
+@param[in]	pos	position of virtual column
+@return pointer to virtual column object */
+UNIV_INLINE
+dict_v_col_t*
+dict_table_get_nth_v_col(
+        const dict_table_t*	table,
+        ulint			pos);
+/********************************************************************//**
+Gets the given system column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+	const dict_table_t*	table,	/*!< in: table */
+	unsigned		sys)	/*!< in: DATA_ROW_ID, ... */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+#else /* UNIV_DEBUG */
+#define dict_table_get_nth_col(table, pos)	(&(table)->cols[pos])
+#define dict_table_get_sys_col(table, sys)	/* pointer to system column */ \
+	(&(table)->cols[(table)->n_cols + (sys) - DATA_N_SYS_COLS])
+/* Get nth virtual columns */
+#define dict_table_get_nth_v_col(table, pos)	(&(table)->v_cols[pos])
+#endif /* UNIV_DEBUG */
+/** Get the name of the nth column of a table (wrapper function).
+@see dict_col_t::name()
+@param[in]	table	table
+@param[in]	col_nr	column number in table
+@return	column name */
+inline
+const char*
+dict_table_get_col_name(const dict_table_t* table, ulint col_nr)
+{
+	return(dict_table_get_nth_col(table, col_nr)->name(*table));
+}
+
+/********************************************************************//**
+Gets the given system column number of a table.
+@return column number */
+UNIV_INLINE
+unsigned
+dict_table_get_sys_col_no(
+/*======================*/
+	const dict_table_t*	table,	/*!< in: table */
+	unsigned		sys)	/*!< in: DATA_ROW_ID, ... */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/********************************************************************//**
+Returns the minimum data size of an index record.
+@return minimum data size in bytes */
+UNIV_INLINE
+unsigned
+dict_index_get_min_size(
+/*====================*/
+	const dict_index_t*	index)	/*!< in: index */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Shorthand for table->not_redundant().
+@param table	pointer to a table object */
+#define dict_table_is_comp(table) (table)->not_redundant()
+
+/** Check whether BLOBs of a table are atomic, that is, whether no prefix
+of an externally stored column is kept locally.
+@param[in]	table	InnoDB table
+@return whether BLOBs are atomic */
+inline
+bool
+dict_table_has_atomic_blobs(const dict_table_t* table)
+{
+	const bool	atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(table->flags);
+
+	return atomic_blobs;
+}
+
+/** @return potential max length stored inline for externally stored fields */
+inline size_t dict_table_t::get_overflow_field_local_len() const
+{
+	/* The field reference is always stored locally. */
+	size_t	local_len = BTR_EXTERN_FIELD_REF_SIZE;
+
+	if (!dict_table_has_atomic_blobs(this)) {
+		/* up to MySQL 5.1: a 768-byte prefix is stored locally
+		as well. ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED
+		do not store any BLOB prefix locally. */
+		local_len += DICT_ANTELOPE_MAX_INDEX_COL_LEN;
+	}
+
+	return local_len;
+}
+
+/** Set the various values in a dict_table_t::flags pointer.
+@param[in,out]	flags		Pointer to the table flags to be set
+@param[in]	format		Row format
+@param[in]	zip_ssize	Zip Shift Size
+@param[in]	use_data_dir	Table uses DATA DIRECTORY
+@param[in]	page_compressed Table uses page compression
+@param[in]	page_compression_level Page compression level */
+UNIV_INLINE
+void
+dict_tf_set(
+	ulint*		flags,
+	rec_format_t	format,
+	ulint		zip_ssize,
+	bool		use_data_dir,
+	bool		page_compressed,
+	ulint		page_compression_level);
+
+/** Convert a 32 bit integer table flags to the 32 bit FSP Flags.
+Fsp Flags are written into the tablespace header at the offset
+FSP_SPACE_FLAGS and are also stored in the fil_space_t::flags field.
+The following chart shows the translation of the low order bit.
+Other bits are the same.
+========================= Low order bit ==========================
+                    | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC
+dict_table_t::flags |     0     |    1    |     1      |    1
+fil_space_t::flags  |     0     |    0    |     1      |    1
+==================================================================
+@param[in]	table_flags	dict_table_t::flags
+@return tablespace flags (fil_space_t::flags) */
+inline uint32_t dict_tf_to_fsp_flags(unsigned table_flags)
+  MY_ATTRIBUTE((const));
+
+/** Compute the ROW_FORMAT=COMPRESSED page size encoded in table flags.
+@param[in]	flags	flags
+@return ROW_FORMAT=COMPRESSED page size
+@retval	0 if not compressed */
+inline ulint dict_tf_get_zip_size(ulint flags)
+{
+	const ulint	ssize_bits = flags & DICT_TF_MASK_ZIP_SSIZE;
+
+	if (!ssize_bits) {
+		/* No ZIP_SSIZE bits are set: not compressed. */
+		return 0;
+	}
+
+	/* Move the bits to their position in the tablespace flags, so that
+	the FSP_FLAGS accessor can extract the shift size. */
+	return (UNIV_ZIP_SIZE_MIN >> 1)
+		<< (FSP_FLAGS_GET_ZIP_SSIZE(ssize_bits >> DICT_TF_POS_ZIP_SSIZE
+					    << FSP_FLAGS_POS_ZIP_SSIZE));
+}
+
+/********************************************************************//**
+Checks if a column is in the ordering columns of the clustered index of a
+table. Column prefixes are treated like whole columns.
+@return TRUE if the column, or its prefix, is in the clustered key */
+ibool
+dict_table_col_in_clustered_key(
+/*============================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			n)	/*!< in: column number */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*******************************************************************//**
+Check if the table has an FTS index.
+@return TRUE if table has an FTS index */
+UNIV_INLINE
+ibool
+dict_table_has_fts_index(
+/*=====================*/
+	dict_table_t*   table)		/*!< in: table */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Copies types of virtual columns contained in table to tuple and sets all
+fields of the tuple to the SQL NULL value.  This function should
+be called right after dtuple_create().
+@param[in,out]	tuple	data tuple
+@param[in]	table	table
+*/
+void
+dict_table_copy_v_types(
+	dtuple_t*		tuple,
+	const dict_table_t*	table);
+
+/*******************************************************************//**
+Copies types of columns contained in table to tuple and sets all
+fields of the tuple to the SQL NULL value.  This function should
+be called right after dtuple_create(). */
+void
+dict_table_copy_types(
+/*==================*/
+	dtuple_t*		tuple,	/*!< in/out: data tuple */
+	const dict_table_t*	table)	/*!< in: table */
+	MY_ATTRIBUTE((nonnull));
+/** Adds an index to the dictionary cache. The index may be defined on
+virtual columns that are being added together with it.
+@param[in,out]	index	index; NOTE! The index memory
+			object is freed in this function!
+@param[in]	page_no	root page number of the index
+@param[in]	add_v	virtual columns being added along with ADD INDEX
+@return DB_SUCCESS, or DB_CORRUPTION */
+dberr_t
+dict_index_add_to_cache(
+	dict_index_t*&		index,
+	ulint			page_no,
+	const dict_add_v_col_t* add_v = NULL)
+	MY_ATTRIBUTE((warn_unused_result));
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index,
+including fields added by the dictionary system.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_fields(
+/*====================*/
+	const dict_index_t*	index)	/*!< in: an internal
+					representation of index (in
+					the dictionary cache) */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account: in the B-tree use the value
+returned by dict_index_get_n_unique_in_tree.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique(
+/*====================*/
+	const dict_index_t*	index)	/*!< in: an internal representation
+					of index (in the dictionary cache) */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+which uniquely determine the position of an index entry in the index, if
+we also take multiversioning into account.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique_in_tree(
+/*============================*/
+	const dict_index_t*	index)	/*!< in: an internal representation
+					of index (in the dictionary cache) */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** The number of fields in the nonleaf page of spatial index, except
+the page no field. */
+#define DICT_INDEX_SPATIAL_NODEPTR_SIZE	1
+/**
+Gets the number of fields on nonleaf page level in the internal representation
+of an index which uniquely determine the position of an index entry in the
+index, if we also take multiversioning into account. Note, it doesn't
+include page no field.
+@param[in]	index	index
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique_in_tree_nonleaf(
+	const dict_index_t*	index)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the number of user-defined ordering fields in the index. In the internal
+representation we add the row id to the ordering fields to make all indexes
+unique, but this function returns the number of fields the user defined
+in the index as ordering fields.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+	const dict_index_t*	index)	/*!< in: an internal representation
+					of index (in the dictionary cache) */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth field of an index.
+@return pointer to field object */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			pos)	/*!< in: position of field */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+#else /* UNIV_DEBUG */
+# define dict_index_get_nth_field(index, pos) ((index)->fields + (pos))
+#endif /* UNIV_DEBUG */
+/********************************************************************//**
+Gets pointer to the nth column in an index.
+@return column */
+UNIV_INLINE
+const dict_col_t*
+dict_index_get_nth_col(
+/*===================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			pos)	/*!< in: position of the field */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the column number of the nth field in an index.
+@return column number */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+/*======================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			pos)	/*!< in: position of the field */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_pos(
+/*=======================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			n,	/*!< in: column number */
+	ulint*			prefix_col_pos) /*!< out: col num if prefix */
+	MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/** Looks for column n in an index.
+@param[in]	index		index
+@param[in]	n		column number
+@param[in]	inc_prefix	true=consider column prefixes too
+@param[in]	is_virtual	true==virtual column
+@param[out]	prefix_col_pos	col num if prefix
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+ulint
+dict_index_get_nth_col_or_prefix_pos(
+	const dict_index_t*	index,
+	ulint			n,
+	bool			inc_prefix,
+	bool			is_virtual,
+	ulint*			prefix_col_pos)
+	MY_ATTRIBUTE((warn_unused_result));
+/********************************************************************//**
+Looks for a matching field in an index. The column has to be the same. The
+column in index must be complete, or must contain a prefix longer than the
+column in index2. That is, we must be able to construct the prefix in index2
+from the prefix in index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+ulint
+dict_index_get_nth_field_pos(
+/*=========================*/
+	const dict_index_t*	index,	/*!< in: index from which to search */
+	const dict_index_t*	index2,	/*!< in: index */
+	ulint			n)	/*!< in: field number in index2 */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Looks for column n position in the clustered index.
+@return position in internal representation of the clustered index */
+unsigned
+dict_table_get_nth_col_pos(
+/*=======================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			n,	/*!< in: column number */
+	ulint*			prefix_col_pos) /*!< out: col num if prefix */
+	MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+/** Add a column to an index.
+@param index          index
+@param table          table
+@param col            column
+@param prefix_len     column prefix length
+@param descending     whether to use descending order */
+void dict_index_add_col(dict_index_t *index, const dict_table_t *table,
+                        dict_col_t *col, ulint prefix_len,
+                        bool descending= false)
+  MY_ATTRIBUTE((nonnull));
+
+/*******************************************************************//**
+Copies types of fields contained in index to tuple. */
+void
+dict_index_copy_types(
+/*==================*/
+	dtuple_t*		tuple,		/*!< in/out: data tuple */
+	const dict_index_t*	index,		/*!< in: index */
+	ulint			n_fields)	/*!< in: number of
+						field types to copy */
+	MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Gets the field column.
+@return field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+/*===============*/
+	const dict_field_t*	field)	/*!< in: index field */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+@return index, NULL if not found */
+dict_index_t*
+dict_index_get_if_in_cache_low(
+/*===========================*/
+	index_id_t	index_id)	/*!< in: index id */
+	MY_ATTRIBUTE((warn_unused_result));
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+@return index, NULL if not found */
+dict_index_t*
+dict_index_get_if_in_cache(
+/*=======================*/
+	index_id_t	index_id)	/*!< in: index id */
+	MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************************//**
+Checks that a tuple has n_fields_cmp value in a sensible range, so that
+no comparison can occur with the page number field in a node pointer.
+@return TRUE if ok */
+ibool
+dict_index_check_search_tuple(
+/*==========================*/
+	const dict_index_t*	index,	/*!< in: index tree */
+	const dtuple_t*		tuple)	/*!< in: tuple used in a search */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Whether and when to allow temporary index names */
+enum check_name {
+	/** Require all indexes to be complete. */
+	CHECK_ALL_COMPLETE,
+	/** Allow aborted online index creation. */
+	CHECK_ABORTED_OK,
+	/** Allow partial indexes to exist. */
+	CHECK_PARTIAL_OK
+};
+/**********************************************************************//**
+Check for duplicate index entries in a table [using the index name] */
+void
+dict_table_check_for_dup_indexes(
+/*=============================*/
+	const dict_table_t*	table,	/*!< in: Check for dup indexes
+					in this table */
+	enum check_name		check)	/*!< in: whether and when to allow
+					temporary index names */
+	MY_ATTRIBUTE((nonnull));
+#endif /* UNIV_DEBUG */
+/**********************************************************************//**
+Builds a node pointer out of a physical record and a page number.
+@return own: node pointer */
+dtuple_t*
+dict_index_build_node_ptr(
+/*======================*/
+	const dict_index_t*	index,	/*!< in: index */
+	const rec_t*		rec,	/*!< in: record for which to build node
+					pointer */
+	ulint			page_no,/*!< in: page number to put in node
+					pointer */
+	mem_heap_t*		heap,	/*!< in: memory heap where pointer
+					created */
+	ulint			level)	/*!< in: level of rec in tree:
+					0 means leaf level */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Convert a physical record into a search tuple.
+@param[in]	rec		index record (not necessarily in an index page)
+@param[in]	index		index
+@param[in]	leaf		whether rec is in a leaf page
+@param[in]	n_fields	number of data fields
+@param[in,out]	heap		memory heap for allocation
+@return own: data tuple */
+dtuple_t*
+dict_index_build_data_tuple(
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	bool			leaf,
+	ulint			n_fields,
+	mem_heap_t*		heap)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Gets the page number of the root of the index tree.
+@return page number */
+UNIV_INLINE
+uint32_t
+dict_index_get_page(
+/*================*/
+	const dict_index_t*	tree)	/*!< in: index */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index.
+@return number of free bytes on page, reserved for updates */
+UNIV_INLINE
+ulint
+dict_index_get_space_reserve(void);
+/*==============================*/
+
+/* Online index creation @{ */
+/********************************************************************//**
+Gets the status of online index creation.
+@return the status */
+UNIV_INLINE
+enum online_index_status
+dict_index_get_online_status(
+/*=========================*/
+	const dict_index_t*	index)	/*!< in: secondary index */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Sets the status of online index creation. */
+UNIV_INLINE
+void
+dict_index_set_online_status(
+/*=========================*/
+	dict_index_t*			index,	/*!< in/out: index */
+	enum online_index_status	status)	/*!< in: status */
+	MY_ATTRIBUTE((nonnull));
+/********************************************************************//**
+Determines if a secondary index is being or has been created online,
+or if the table is being rebuilt online, allowing concurrent modifications
+to the table.
+@retval true if the index is being or has been built online, or
+if this is a clustered index and the table is being or has been rebuilt online
+@retval false if the index has been created or the table has been
+rebuilt completely */
+UNIV_INLINE
+bool
+dict_index_is_online_ddl(
+/*=====================*/
+	const dict_index_t*	index)	/*!< in: index */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Calculates the minimum record length in an index. */
+ulint
+dict_index_calc_min_rec_len(
+/*========================*/
+	const dict_index_t*	index)	/*!< in: index */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/********************************************************************//**
+Checks if the database name in two table names is the same.
+@return TRUE if same db name */
+ibool
+dict_tables_have_same_db(
+/*=====================*/
+	const char*	name1,	/*!< in: table name in the form
+				dbname '/' tablename */
+	const char*	name2)	/*!< in: table name in the form
+				dbname '/' tablename */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Get an index by name.
+@param[in]	table		the table where to look for the index
+@param[in]	name		the index name to look for
+@return index, NULL if does not exist */
+dict_index_t*
+dict_table_get_index_on_name(dict_table_t* table, const char* name)
+		MY_ATTRIBUTE((warn_unused_result));
+
+/** Get an index by name (overload for a const table).
+Forwards to the overload that takes a non-const table.
+@param[in]	table		the table where to look for the index
+@param[in]	name		the index name to look for
+@return index, NULL if does not exist */
+inline
+const dict_index_t*
+dict_table_get_index_on_name(const dict_table_t* table, const char* name)
+{
+	dict_table_t* const	mutable_table
+		= const_cast<dict_table_t*>(table);
+
+	return dict_table_get_index_on_name(mutable_table, name);
+}
+
+/***************************************************************
+Check whether a column exists in an FTS index. */
+UNIV_INLINE
+ulint
+dict_table_is_fts_column(
+/*=====================*/
+				/* out: ULINT_UNDEFINED if no match else
+				the offset within the vector */
+	ib_vector_t*	indexes,/* in: vector containing only FTS indexes */
+	ulint		col_no,	/* in: col number to search for */
+	bool		is_virtual)/*!< in: whether it is a virtual column */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Looks for an index with the given id given a table instance.
+@param[in]	table	table instance
+@param[in]	id	index id
+@return index or NULL */
+dict_index_t*
+dict_table_find_index_on_id(
+	const dict_table_t*	table,
+	index_id_t		id)
+	MY_ATTRIBUTE((nonnull(1)));
+
+/** Maximum number of columns in a foreign key constraint. Please Note MySQL
+has a much lower limit on the number of columns allowed in a foreign key
+constraint */
+#define MAX_NUM_FK_COLUMNS		500
+
+/* Buffers for storing detailed information about the latest foreign key
+and unique key errors */
+extern FILE*		dict_foreign_err_file;
+extern mysql_mutex_t dict_foreign_err_mutex;
+
+/** InnoDB data dictionary cache */
+class dict_sys_t
+{
+  /** The my_hrtime_coarse().val of the oldest lock_wait() start, or 0 */
+  std::atomic<ulonglong> latch_ex_wait_start;
+
+  /** the rw-latch protecting the data dictionary cache */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_lock latch;
+#ifdef UNIV_DEBUG
+  /** whether latch is being held in exclusive mode (by any thread) */
+  Atomic_relaxed<pthread_t> latch_ex;
+  /** number of S-latch holders */
+  Atomic_counter<uint32_t> latch_readers;
+#endif
+public:
+  /** Indexes of SYS_TABLE[] */
+  enum
+  {
+    SYS_TABLES= 0,
+    SYS_INDEXES,
+    SYS_COLUMNS,
+    SYS_FIELDS,
+    SYS_FOREIGN,
+    SYS_FOREIGN_COLS,
+    SYS_VIRTUAL
+  };
+  /** System table names */
+  static const span<const char> SYS_TABLE[];
+
+  /** all tables (persistent and temporary), hashed by name */
+  hash_table_t table_hash;
+  /** hash table of persistent table IDs */
+  hash_table_t table_id_hash;
+
+  /** the SYS_TABLES table */
+  dict_table_t *sys_tables;
+  /** the SYS_COLUMNS table */
+  dict_table_t *sys_columns;
+  /** the SYS_INDEXES table */
+  dict_table_t *sys_indexes;
+  /** the SYS_FIELDS table */
+  dict_table_t *sys_fields;
+  /** the SYS_FOREIGN table */
+  dict_table_t *sys_foreign;
+  /** the SYS_FOREIGN_COLS table */
+  dict_table_t *sys_foreign_cols;
+  /** the SYS_VIRTUAL table */
+  dict_table_t *sys_virtual;
+
+  /** @return whether all non-hard-coded system tables exist
+  (SYS_FOREIGN, SYS_FOREIGN_COLS and SYS_VIRTUAL) */
+  bool sys_tables_exist() const
+  {
+    return UNIV_LIKELY(sys_foreign != nullptr && sys_foreign_cols != nullptr &&
+                       sys_virtual != nullptr);
+  }
+
+  /** list of persistent tables that can be evicted */
+  UT_LIST_BASE_NODE_T(dict_table_t) table_LRU;
+  /** list of persistent tables that cannot be evicted */
+  UT_LIST_BASE_NODE_T(dict_table_t) table_non_LRU;
+
+private:
+  bool m_initialised= false;
+  /** the sequence of temporary table IDs */
+  std::atomic<table_id_t> temp_table_id{DICT_HDR_FIRST_ID};
+  /** hash table of temporary table IDs */
+  hash_table_t temp_id_hash;
+  /** the next value of DB_ROW_ID, backed by DICT_HDR_ROW_ID
+  (FIXME: remove this, and move to dict_table_t) */
+  Atomic_relaxed<row_id_t> row_id;
+  /** The synchronization interval of row_id */
+  static constexpr size_t ROW_ID_WRITE_MARGIN= 256;
+public:
+  /** Diagnostic message for exceeding the lock_wait() timeout */
+  static const char fatal_msg[];
+
+  /** @return A new value for GEN_CLUST_INDEX(DB_ROW_ID) */
+  inline row_id_t get_new_row_id();
+
+  /** Ensure that row_id is not smaller than id, on IMPORT TABLESPACE */
+  inline void update_row_id(row_id_t id);
+
+  /** Recover the global DB_ROW_ID sequence on database startup:
+  round id up to a multiple of ROW_ID_WRITE_MARGIN and skip one more margin.
+  @param id   the recovered DB_ROW_ID value */
+  void recover_row_id(row_id_t id)
+  {
+    const auto aligned= ut_uint64_align_up(id, ROW_ID_WRITE_MARGIN);
+    row_id= aligned + ROW_ID_WRITE_MARGIN;
+  }
+
+  /** Allocate the next value of the temporary table ID sequence.
+  @return a new temporary table ID */
+  table_id_t acquire_temporary_table_id()
+  {
+    const table_id_t id= temp_table_id.fetch_add(1, std::memory_order_relaxed);
+    return id;
+  }
+
+  /** Look up a temporary table and acquire a reference to it.
+  @param id        temporary table ID
+  @return          temporary table
+  @retval nullptr  if the table does not exist
+  (should only happen during the rollback of CREATE...SELECT) */
+  dict_table_t *acquire_temporary_table(table_id_t id)
+  {
+    ut_ad(frozen());
+    dict_table_t *found;
+    const ulint id_fold= ut_fold_ull(id);
+    HASH_SEARCH(id_hash, &temp_id_hash, id_fold, dict_table_t*, found,
+                ut_ad(found->cached), found->id == id);
+    if (UNIV_UNLIKELY(found == nullptr))
+      return nullptr;
+    DBUG_ASSERT(found->is_temporary());
+    DBUG_ASSERT(found->id >= DICT_HDR_FIRST_ID);
+    found->acquire();
+    return found;
+  }
+
+  /** Look up a persistent table in table_id_hash.
+  No reference is acquired on the returned table.
+  @param id     table ID
+  @return table
+  @retval nullptr if not cached */
+  dict_table_t *find_table(table_id_t id)
+  {
+    ut_ad(frozen());
+    dict_table_t *found;
+    const ulint id_fold= ut_fold_ull(id);
+    HASH_SEARCH(id_hash, &table_id_hash, id_fold, dict_table_t*, found,
+                ut_ad(found->cached), found->id == id);
+    DBUG_ASSERT(found == nullptr || !found->is_temporary());
+    return found;
+  }
+
+  /** @return the value of m_initialised
+  (presumably set by create() and cleared by close(); verify in dict0dict.cc) */
+  bool is_initialised() const { return m_initialised; }
+
+  /** Initialise the data dictionary cache. */
+  void create();
+
+  /** Close the data dictionary cache on shutdown. */
+  void close();
+
+  /** Resize the hash tables based on the current buffer pool size. */
+  void resize();
+
+  /** Add a table definition to the data dictionary cache */
+  inline void add(dict_table_t* table);
+  /** Remove a table definition from the data dictionary cache.
+  @param[in,out]	table	cached table definition to be evicted
+  @param[in]	lru	whether this is part of least-recently-used eviction
+  @param[in]	keep	whether to keep (not free) the object */
+  void remove(dict_table_t* table, bool lru = false, bool keep = false);
+
+#ifdef UNIV_DEBUG
+  /** Search one of the table lists for a table.
+  @tparam in_lru  whether to search table_LRU (otherwise table_non_LRU)
+  @param table    table to look for
+  @return whether the table was found in the list */
+  template <bool in_lru> bool find(const dict_table_t *table)
+  {
+    ut_ad(table);
+    ut_ad(table->can_be_evicted == in_lru);
+    ut_ad(frozen());
+    const dict_table_t *t= in_lru ? table_LRU.start : table_non_LRU.start;
+    while (t)
+    {
+      if (t == table)
+        return true;
+      ut_ad(t->can_be_evicted == in_lru);
+      /* the same list node member is used for traversing either list */
+      t= UT_LIST_GET_NEXT(table_LRU, t);
+    }
+    return false;
+  }
+  /** Search for a table in the list that matches table->can_be_evicted.
+  @param table   table to look for
+  @return whether the table was found */
+  bool find(const dict_table_t *table)
+  {
+    if (table->can_be_evicted)
+      return find<true>(table);
+    return find<false>(table);
+  }
+#endif
+
+  /** Move a table to the non-LRU list from the LRU list.
+  Does nothing if the table is already marked as not evictable.
+  The caller must hold the exclusive dictionary latch.
+  @param table   cached table that must not be evicted */
+  void prevent_eviction(dict_table_t *table)
+  {
+    /* This must be ut_ad(), not ut_d(): ut_d() would merely evaluate
+    locked() and discard the result, so nothing would be asserted. */
+    ut_ad(locked());
+    ut_ad(find(table));
+    if (!table->can_be_evicted)
+      return;
+    table->can_be_evicted= false;
+    UT_LIST_REMOVE(table_LRU, table);
+    UT_LIST_ADD_LAST(table_non_LRU, table);
+  }
+
+#ifdef UNIV_DEBUG
+  /** @return whether any thread (not necessarily the current thread)
+  is holding the latch; that is, this check may return false
+  positives */
+  bool frozen() const { return latch_readers || latch_ex; }
+  /** @return whether any thread (not necessarily the current thread)
+  is holding a shared latch */
+  bool frozen_not_locked() const { return latch_readers; }
+  /** @return whether the current thread holds the exclusive latch */
+  bool locked() const { return latch_ex == pthread_self(); }
+#endif
+private:
+  /** Acquire the exclusive latch */
+  ATTRIBUTE_NOINLINE
+  void lock_wait(SRW_LOCK_ARGS(const char *file, unsigned line));
+public:
+  /** @return the my_hrtime_coarse().val of the oldest lock_wait() start,
+  assuming that requests are served on a FIFO basis */
+  ulonglong oldest_wait() const
+  { return latch_ex_wait_start.load(std::memory_order_relaxed); }
+
+  /** Exclusively lock the dictionary cache.
+  Tries to acquire the latch without waiting first,
+  and falls back to lock_wait() if that fails. */
+  void lock(SRW_LOCK_ARGS(const char *file, unsigned line))
+  {
+    if (!latch.wr_lock_try())
+    {
+      lock_wait(SRW_LOCK_ARGS(file, line));
+      return;
+    }
+    ut_ad(!latch_readers);
+    ut_ad(!latch_ex);
+    ut_d(latch_ex= pthread_self());
+  }
+
+#ifdef UNIV_PFS_RWLOCK
+  /** Unlock the data dictionary cache. */
+  ATTRIBUTE_NOINLINE void unlock();
+  /** Acquire a shared lock on the dictionary cache. */
+  ATTRIBUTE_NOINLINE void freeze(const char *file, unsigned line);
+  /** Release a shared lock on the dictionary cache. */
+  ATTRIBUTE_NOINLINE void unfreeze();
+#else
+  /** Release the exclusive latch on the data dictionary cache.
+  The current thread must be the one that holds it. */
+  void unlock()
+  {
+    ut_ad(locked());
+    ut_ad(!latch_readers);
+    ut_d(latch_ex= 0);
+    latch.wr_unlock();
+  }
+  /** Acquire a shared lock on the dictionary cache. */
+  void freeze()
+  {
+    latch.rd_lock();
+    ut_ad(!latch_ex);
+    ut_d(latch_readers++);
+  }
+  /** Release a shared lock on the dictionary cache.
+  Counterpart of freeze(). */
+  void unfreeze()
+  {
+    ut_ad(!latch_ex);
+    /* The post-decrement inside the assertion both checks that the
+    reader count was nonzero and decrements it. latch_readers is only
+    declared under UNIV_DEBUG, so this relies on ut_ad() evaluating its
+    argument only in such builds. */
+    ut_ad(latch_readers--);
+    latch.rd_unlock();
+  }
+#endif
+
+  /** Roughly estimate the memory occupied by the data dictionary
+  table and index objects and by the hash table cells.
+  @return number of bytes occupied */
+  TPOOL_SUPPRESS_TSAN ulint rough_size() const
+  {
+    /* No latch; this is a very crude approximation anyway */
+    const ulint n_tables=
+      UT_LIST_GET_LEN(table_LRU) + UT_LIST_GET_LEN(table_non_LRU);
+    /* assumed footprint of one table */
+    const ulint per_table= sizeof(dict_table_t)
+      + sizeof(dict_index_t) * 2
+      + (sizeof(dict_col_t) + sizeof(dict_field_t)) * 10
+      + sizeof(dict_field_t) * 5 /* total number of key fields */
+      + 200; /* arbitrary, covering names and overhead */
+    const ulint n_cells= table_hash.n_cells + table_id_hash.n_cells +
+      temp_id_hash.n_cells;
+    return n_tables * per_table + n_cells * sizeof(hash_cell_t);
+  }
+
+  /** Evict unused, unlocked tables from table_LRU.
+  @param half whether to consider half the tables only (instead of all)
+  @return number of tables evicted */
+  ulint evict_table_LRU(bool half);
+
+  /** Look up a table by name in table_hash.
+  The name is compared by its exact length and bytes.
+  @param name   table name
+  @return table handle
+  @retval nullptr if not found */
+  dict_table_t *find_table(const span<const char> &name) const
+  {
+    ut_ad(frozen());
+    const auto len= name.size();
+    const auto cell= table_hash.calc_hash(my_crc32c(0, name.data(), len));
+    dict_table_t *table=
+      static_cast<dict_table_t*>(HASH_GET_FIRST(&table_hash, cell));
+    while (table)
+    {
+      if (strlen(table->name.m_name) == len &&
+          !memcmp(table->name.m_name, name.data(), len))
+        return table;
+      /* follow the hash bucket chain */
+      table= table->name_hash;
+    }
+    return nullptr;
+  }
+
+  /** Look up or load a table definition
+  @param name   table name
+  @param ignore errors to ignore when loading the table definition
+  @return table handle
+  @retval nullptr if not found */
+  dict_table_t *load_table(const span<const char> &name,
+                           dict_err_ignore_t ignore= DICT_ERR_IGNORE_NONE);
+
+  /** Attempt to load the system tables on startup
+  @return whether any discrepancy with the expected definition was found */
+  bool load_sys_tables();
+  /** Create or check system tables on startup */
+  dberr_t create_or_check_sys_tables();
+};
+
+/** the data dictionary cache */
+extern dict_sys_t	dict_sys;
+
+/*********************************************************************//**
+Converts a database and table name from filesystem encoding
+(e.g. d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) into two
+strings in UTF8 encoding (e.g. dцb and aюbØc). The output buffers must be
+at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */
+void
+dict_fs2utf8(
+/*=========*/
+	const char*	db_and_table,	/*!< in: database and table names,
+					e.g. d@i1b/a@q1b@1Kc */
+	char*		db_utf8,	/*!< out: database name, e.g. dцb */
+	size_t		db_utf8_size,	/*!< in: db_utf8 size */
+	char*		table_utf8,	/*!< out: table name, e.g. aюbØc */
+	size_t		table_utf8_size)/*!< in: table_utf8 size */
+	MY_ATTRIBUTE((nonnull));
+
+/** Flag an index corrupted both in the data dictionary cache
+and in the system table SYS_INDEXES.
+@param index       index to be flagged as corrupted
+@param ctx         context (for error log reporting) */
+void dict_set_corrupted(dict_index_t *index, const char *ctx)
+  ATTRIBUTE_COLD __attribute__((nonnull));
+
+/** Sets merge_threshold in the SYS_INDEXES
+@param[in,out]	index		index
+@param[in]	merge_threshold	value to set */
+void
+dict_index_set_merge_threshold(
+	dict_index_t*	index,
+	ulint		merge_threshold);
+
+#ifdef UNIV_DEBUG
+/** Sets merge_threshold for all indexes in dictionary cache for debug.
+@param[in]	merge_threshold_all	value to set for all indexes */
+void
+dict_set_merge_threshold_all_debug(
+	uint	merge_threshold_all);
+#endif /* UNIV_DEBUG */
+
+/** Validate the table flags.
+@param[in]	flags	Table flags
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf_is_valid(
+	ulint	flags);
+
+/** Validate both table flags and table flags2 and make sure they
+are compatible.
+@param[in]	flags	Table flags
+@param[in]	flags2	Table flags2
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf2_is_valid(
+	ulint	flags,
+	ulint	flags2);
+
+/*********************************************************************//**
+This function should be called whenever a page is successfully
+compressed. Updates the compression padding information. */
+void
+dict_index_zip_success(
+/*===================*/
+	dict_index_t*	index)	/*!< in/out: index to be updated. */
+	MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+This function should be called whenever a page compression attempt
+fails. Updates the compression padding information. */
+void
+dict_index_zip_failure(
+/*===================*/
+	dict_index_t*	index)	/*!< in/out: index to be updated. */
+	MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Return the optimal page size, for which page will likely compress.
+@return page size beyond which page may not compress*/
+ulint
+dict_index_zip_pad_optimal_page_size(
+/*=================================*/
+	dict_index_t*	index)	/*!< in: index for which page size
+				is requested */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Convert table flag to row format string.
+@return row format name */
+const char*
+dict_tf_to_row_format_string(
+/*=========================*/
+	ulint	table_flag);		/*!< in: row format setting */
+
+/** encode number of columns and number of virtual columns in one
+4 bytes value. We could do this because the number of columns in
+InnoDB is limited to 1017
+@param[in]	n_col	number of non-virtual column
+@param[in]	n_v_col	number of virtual column
+@return encoded value */
+UNIV_INLINE
+ulint
+dict_table_encode_n_col(
+	ulint	n_col,
+	ulint	n_v_col);
+
+/** Decode number of virtual and non-virtual columns in one 4 bytes value.
+@param[in]	encoded	encoded value
+@param[in,out]	n_col	number of non-virtual column
+@param[in,out]	n_v_col	number of virtual column */
+UNIV_INLINE
+void
+dict_table_decode_n_col(
+	ulint	encoded,
+	ulint*	n_col,
+	ulint*	n_v_col);
+
+/** Free the virtual column template
+@param[in,out]	vc_templ	virtual column template */
+UNIV_INLINE
+void
+dict_free_vc_templ(
+	dict_vcol_templ_t*	vc_templ);
+
+/** Check whether the table has a virtual index.
+@param[in]	table	InnoDB table
+@return true if the table has a virtual index, false otherwise. */
+UNIV_INLINE
+bool
+dict_table_have_virtual_index(
+	dict_table_t*	table);
+
+#include "dict0dict.inl"
+
+#endif
diff --git a/storage/innobase/include/dict0dict.inl b/storage/innobase/include/dict0dict.inl
new file mode 100644
index 00000000..4cc3eae9
--- /dev/null
+++ b/storage/innobase/include/dict0dict.inl
@@ -0,0 +1,1217 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0dict.inl
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "fsp0sysspace.h"
+#include "dict0pagecompress.h"
+
+/** Read the minimum number of bytes per character of a column.
+@param[in]	col	column
+@return minimum multi-byte character size, in bytes */
+UNIV_INLINE
+unsigned
+dict_col_get_mbminlen(
+	const dict_col_t*	col)
+{
+	const unsigned	min_len = col->mbminlen;
+	return(min_len);
+}
+/** Read the maximum number of bytes per character of a column.
+@param[in]	col	column
+@return maximum multi-byte character size, in bytes */
+UNIV_INLINE
+unsigned
+dict_col_get_mbmaxlen(
+	const dict_col_t*	col)
+{
+	const unsigned	max_len = col->mbmaxlen;
+	return(max_len);
+}
+/** Copy the data type attributes of a column into a dtype_t.
+@param[in]	col	column whose type is read
+@param[out]	type	data type to fill in */
+UNIV_INLINE
+void
+dict_col_copy_type(
+	const dict_col_t*	col,
+	dtype_t*		type)
+{
+	ut_ad(col != NULL);
+	ut_ad(type != NULL);
+
+	type->mbmaxlen = col->mbmaxlen;
+	type->mbminlen = col->mbminlen;
+	type->len = col->len;
+	type->prtype = col->prtype;
+	type->mtype = col->mtype;
+}
+
+#ifdef UNIV_DEBUG
+/** Debug check that a column and a data type agree on mtype, prtype,
+mbminlen and mbmaxlen.
+@param[in]	col	column
+@param[in]	type	data type
+@return TRUE (a mismatch trips a debug assertion instead) */
+UNIV_INLINE
+ibool
+dict_col_type_assert_equal(
+	const dict_col_t*	col,
+	const dtype_t*		type)
+{
+	ut_ad(col->mtype == type->mtype);
+	ut_ad(col->prtype == type->prtype);
+	/* The length (len) is not compared; that check is disabled. */
+	ut_ad(col->mbminlen == type->mbminlen);
+	ut_ad(col->mbmaxlen == type->mbmaxlen);
+	return TRUE;
+}
+#endif /* UNIV_DEBUG */
+
+/** Compute the minimum size of a column by passing its type attributes
+to dtype_get_min_size_low().
+@param[in]	col	column
+@return minimum size */
+UNIV_INLINE
+unsigned
+dict_col_get_min_size(
+	const dict_col_t*	col)
+{
+	return dtype_get_min_size_low(col->mtype, col->prtype, col->len,
+				      col->mbminlen, col->mbmaxlen);
+}
+/** Compute the maximum size of a column from its mtype and len.
+@param[in]	col	column
+@return maximum size */
+UNIV_INLINE
+ulint
+dict_col_get_max_size(
+	const dict_col_t*	col)
+{
+	const ulint	max_size = dtype_get_max_size_low(col->mtype, col->len);
+	return max_size;
+}
+/** Return the size of a fixed-size column.
+@param[in]	col	column
+@param[in]	comp	nonzero=ROW_FORMAT=COMPACT
+@return fixed size, or 0 if the column is not of fixed size */
+UNIV_INLINE
+unsigned
+dict_col_get_fixed_size(
+	const dict_col_t*	col,
+	ulint			comp)
+{
+	return dtype_get_fixed_size_low(col->mtype, col->prtype, col->len,
+					col->mbminlen, col->mbmaxlen, comp);
+}
+/** Return the stored size of SQL NULL in ROW_FORMAT=REDUNDANT: the fixed
+length of the type for fixed-length types, otherwise 0.
+@param[in]	col	column
+@param[in]	comp	nonzero=ROW_FORMAT=COMPACT
+@return SQL NULL storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+unsigned
+dict_col_get_sql_null_size(
+	const dict_col_t*	col,
+	ulint			comp)
+{
+	return dict_col_get_fixed_size(col, comp);
+}
+
+/** Read the position of a column within its table.
+@param[in]	col	column
+@return col->ind, table column position (starting from 0) */
+UNIV_INLINE
+unsigned
+dict_col_get_no(
+	const dict_col_t*	col)
+{
+	const unsigned	col_no = col->ind;
+	return col_no;
+}
+
+/** Find the position of a column in the clustered index. Only fields
+that cover the whole column (no prefix) are considered.
+@param[in]	col		table column
+@param[in]	clust_index	clustered index
+@return position of the first matching field in clust_index
+@retval ULINT_UNDEFINED if no field matches */
+UNIV_INLINE
+ulint
+dict_col_get_clust_pos(
+	const dict_col_t*	col,
+	const dict_index_t*	clust_index)
+{
+	ut_ad(dict_index_is_clust(clust_index));
+
+	for (ulint pos = 0; pos < clust_index->n_def; pos++) {
+		const dict_field_t&	f = clust_index->fields[pos];
+		if (f.col == col && !f.prefix_len) {
+			return pos;
+		}
+	}
+
+	return ULINT_UNDEFINED;
+}
+
+/** Find the position of a column in the given index. Fields that
+cover only a prefix of the column are skipped.
+@param[in]	col	table column
+@param[in]	index	index to be searched for column
+@return position of the column in the given index
+@retval ULINT_UNDEFINED if the column is not found */
+UNIV_INLINE
+ulint
+dict_col_get_index_pos(
+	const dict_col_t*	col,
+	const dict_index_t*	index)
+{
+	ulint	pos = 0;
+	while (pos < index->n_def) {
+		const dict_field_t*	f = index->fields + pos;
+		if (f->col == col && !f->prefix_len) {
+			return pos;
+		}
+		pos++;
+	}
+	return ULINT_UNDEFINED;
+}
+
+#ifdef UNIV_DEBUG
+/** Get the first index on the table (the clustered index).
+Compiled only in debug builds (UNIV_DEBUG).
+@param[in]	table	table
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+	const dict_table_t*	table)
+{
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	dict_table_t*	t = const_cast<dict_table_t*>(table);
+	return UT_LIST_GET_FIRST(t->indexes);
+}
+
+/** Get the last index on the table.
+Compiled only in debug builds (UNIV_DEBUG).
+@param[in]	table	table
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_last_index(
+	const dict_table_t*	table)
+{
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	dict_table_t*	t = const_cast<dict_table_t*>(table);
+	return UT_LIST_GET_LAST(t->indexes);
+}
+
+/** Get the index that follows the given one (debug builds only).
+@param[in]	index	index
+@return index, NULL if none left */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+	const dict_index_t*	index)
+{
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+	dict_index_t*	cur = const_cast<dict_index_t*>(index);
+	return UT_LIST_GET_NEXT(indexes, cur);
+}
+#endif /* UNIV_DEBUG */
+
+/** Get the number of user-defined non-virtual columns in a table in
+the dictionary cache, i.e. n_cols minus DATA_N_SYS_COLS.
+@param[in]	table	table
+@return number of user-defined (e.g., not ROW_ID) non-virtual
+columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_user_cols(
+	const dict_table_t*	table)
+{
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	/* n_cols counts stored columns only. A table may contain
+	virtual columns and no user-specified stored columns at all. */
+	ut_ad(table->n_cols >= DATA_N_SYS_COLS);
+	const unsigned	n_stored = unsigned(table->n_cols);
+	return n_stored - DATA_N_SYS_COLS;
+}
+
+/** Get the number of all non-virtual columns (system columns included)
+in a table in the dictionary cache.
+@param[in]	table	table
+@return number of non-virtual columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_cols(
+	const dict_table_t*	table)
+{
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	const unsigned	n_stored = table->n_cols;
+	return n_stored;
+}
+
+/** Get the number of virtual columns of a table in the dictionary cache.
+@param[in]	table	table to inspect
+@return table->n_v_cols, the number of virtual columns */
+UNIV_INLINE
+unsigned
+dict_table_get_n_v_cols(
+	const dict_table_t*	table)
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	const unsigned	n_virtual = table->n_v_cols;
+	return n_virtual;
+}
+
+/** Check if a table has indexed virtual columns, that is, whether any
+of its virtual columns has m_col.ord_part set.
+@param[in]	table	the table to check
+@return true if the table has indexed virtual columns */
+UNIV_INLINE
+bool
+dict_table_has_indexed_v_cols(
+	const dict_table_t*	table)
+{
+	for (unsigned n = 0; n < table->n_v_cols; ++n) {
+		const dict_col_t&	base_col
+			= dict_table_get_nth_v_col(table, n)->m_col;
+		if (base_col.ord_part) {
+			return true;
+		}
+	}
+	return false;
+}
+
+/** Get the approximate, estimated number of rows in the table.
+The statistics must have been initialized (debug assertion).
+@param[in]	table	table
+@return estimated number of rows (table->stat_n_rows) */
+UNIV_INLINE
+ib_uint64_t
+dict_table_get_n_rows(
+	const dict_table_t*	table)
+{
+	ut_ad(table->stat_initialized);
+	const ib_uint64_t	n_rows = table->stat_n_rows;
+	return n_rows;
+}
+
+/** Increment the number of rows in the table by one, stopping at
+0xFFFFFFFFFFFFFFFF. Does nothing unless table->stat_initialized is set.
+This operation is not protected by any latch; the number is approximate.
+@param[in,out]	table	table */
+UNIV_INLINE
+void
+dict_table_n_rows_inc(
+	dict_table_t*	table)
+{
+	if (!table->stat_initialized) {
+		return;
+	}
+	const ib_uint64_t	cur = table->stat_n_rows;
+	if (cur < 0xFFFFFFFFFFFFFFFFULL) {
+		table->stat_n_rows = cur + 1;
+	}
+}
+
+/** Decrement the number of rows in the table by one, never going below
+zero. Does nothing unless table->stat_initialized is set.
+This operation is not protected by any latch; the number is approximate.
+@param[in,out]	table	table */
+UNIV_INLINE
+void
+dict_table_n_rows_dec(
+	dict_table_t*	table)
+{
+	if (!table->stat_initialized) {
+		return;
+	}
+	const ib_uint64_t	cur = table->stat_n_rows;
+	if (cur > 0) {
+		table->stat_n_rows = cur - 1;
+	}
+}
+
+#ifdef UNIV_DEBUG
+/** Get the nth column of a table (debug builds only).
+@param[in]	table	table
+@param[in]	pos	position of column; must be less than table->n_def
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+	const dict_table_t*	table,
+	ulint			pos)
+{
+	ut_ad(pos < table->n_def);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	dict_col_t*	first = (dict_col_t*) (table->cols);
+	return &first[pos];
+}
+
+/** Get the nth virtual column of a table (debug builds only).
+@param[in]	table	table
+@param[in]	pos	position of virtual column, less than table->n_v_def
+@return pointer to virtual column object */
+UNIV_INLINE
+dict_v_col_t*
+dict_table_get_nth_v_col(
+	const dict_table_t*	table,
+	ulint			pos)
+{
+	ut_ad(table);
+	ut_ad(pos < table->n_v_def);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	ut_ad(!table->v_cols[pos].m_col.is_added());
+	ut_ad(!table->v_cols[pos].m_col.is_dropped());
+	return table->v_cols + pos;
+}
+
+/** Get the given system column of a table.
+Compiled only in debug builds (UNIV_DEBUG).
+@param[in]	table	table
+@param[in]	sys	DATA_ROW_ID, ...
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+	const dict_table_t*	table,
+	unsigned		sys)
+{
+	const unsigned	col_no = dict_table_get_sys_col_no(table, sys);
+	dict_col_t*	col = dict_table_get_nth_col(table, col_no);
+
+	ut_ad(col->mtype == DATA_SYS);
+	ut_ad(col->prtype == (sys | DATA_NOT_NULL));
+	return col;
+}
+#endif /* UNIV_DEBUG */
+
+/** Get the column number of the given system column of a table.
+@param[in]	table	table
+@param[in]	sys	DATA_ROW_ID, ...; must be less than DATA_N_SYS_COLS
+@return column number, table->n_cols - DATA_N_SYS_COLS + sys */
+UNIV_INLINE
+unsigned
+dict_table_get_sys_col_no(
+	const dict_table_t*	table,
+	unsigned		sys)
+{
+	ut_ad(sys < DATA_N_SYS_COLS);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	return (sys - DATA_N_SYS_COLS) + unsigned(table->n_cols);
+}
+
+/** Check if the table has an FTS index, by testing the DICT_TF2_FTS
+flag of the table.
+@param[in]	table	table
+@return TRUE if table has an FTS index */
+UNIV_INLINE
+ibool
+dict_table_has_fts_index(
+	dict_table_t*   table)
+{
+	return DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS);
+}
+
+/** Validate the flags for tables that are not ROW_FORMAT=REDUNDANT.
+@param[in]	flags		table flags
+@return whether the flags are valid */
+inline
+bool
+dict_tf_is_valid_not_redundant(ulint flags)
+{
+	const ulint	zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags);
+
+	if (zip_ssize) {
+		/* ROW_FORMAT=COMPRESSED implies ROW_FORMAT=DYNAMIC
+		for the uncompressed page format */
+		if (!DICT_TF_HAS_ATOMIC_BLOBS(flags)) {
+			return false;
+		}
+
+		/* KEY_BLOCK_SIZE is out of bounds, or
+		ROW_FORMAT=COMPRESSED is not supported with this
+		innodb_page_size (only up to 16KiB) */
+		if (zip_ssize > PAGE_ZIP_SSIZE_MAX
+		    || zip_ssize > srv_page_size_shift
+		    || srv_page_size_shift > UNIV_ZIP_SIZE_SHIFT_MAX) {
+			return false;
+		}
+	}
+
+	const bool	page_compressed = DICT_TF_GET_PAGE_COMPRESSION(flags);
+	const ulint	level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags);
+
+	if (level == 0) {
+		/* PAGE_COMPRESSION_LEVEL=0 should imply PAGE_COMPRESSED=NO */
+		return !page_compressed;
+	}
+	if (level > 9) {
+		/* Invalid PAGE_COMPRESSION_LEVEL value */
+		return false;
+	}
+	/* PAGE_COMPRESSION_LEVEL 1..9 requires PAGE_COMPRESSED=YES and
+	ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC (not COMPRESSED) */
+	return !zip_ssize && page_compressed;
+}
+
+/** Validate the table flags.
+@param[in]	flags	Table flags
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf_is_valid(
+	ulint	flags)
+{
+	ut_ad(flags < 1U << DICT_TF_BITS);
+	/* The DATA_DIRECTORY flag can be assigned fully independently
+	of all other persistent table flags, so ignore it here. */
+	flags &= ~DICT_TF_MASK_DATA_DIR;
+
+	if (flags & 1) {
+		return dict_tf_is_valid_not_redundant(flags);
+	}
+
+	/* Only ROW_FORMAT=REDUNDANT has 0 in the least significant bit.
+	Besides DATA_DIR (cleared above), the only flag that it may
+	carry is NO_ROLLBACK; anything else is invalid. */
+	return flags == DICT_TF_MASK_NO_ROLLBACK || flags == 0;
+}
+
+/** Validate both table flags and table flags2: the flags must pass
+dict_tf_is_valid() and flags2 must not have any bit of
+DICT_TF2_UNUSED_BIT_MASK set.
+@param[in]	flags	Table flags
+@param[in]	flags2	Table flags2
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf2_is_valid(
+	ulint	flags,
+	ulint	flags2)
+{
+	/* The primary flags are checked first. */
+	if (!dict_tf_is_valid(flags)) {
+		return false;
+	}
+
+	/* No bit of DICT_TF2_UNUSED_BIT_MASK may be set in flags2. */
+	const ulint	unused_bits = flags2 & DICT_TF2_UNUSED_BIT_MASK;
+	return unused_bits == 0;
+}
+
+/** Determine the record (row) format from dict_table_t::flags.
+The COMPACT bit is clear only for REDUNDANT. With it set, the format is
+COMPACT unless ATOMIC_BLOBS is set, in which case a nonzero ZIP_SSIZE
+means COMPRESSED and zero means DYNAMIC.
+@param[in]	flags	dict_table_t::flags; checked with ut_a()
+@return row format */
+UNIV_INLINE
+rec_format_t
+dict_tf_get_rec_format(
+	ulint		flags)
+{
+	ut_a(dict_tf_is_valid(flags));
+
+	rec_format_t	format;
+
+	if (!DICT_TF_GET_COMPACT(flags)) {
+		format = REC_FORMAT_REDUNDANT;
+	} else if (!DICT_TF_HAS_ATOMIC_BLOBS(flags)) {
+		format = REC_FORMAT_COMPACT;
+	} else if (DICT_TF_GET_ZIP_SSIZE(flags)) {
+		format = REC_FORMAT_COMPRESSED;
+	} else {
+		format = REC_FORMAT_DYNAMIC;
+	}
+
+	return format;
+}
+
+/** Set the various values in a dict_table_t::flags pointer.
+Any previous contents of *flags are overwritten.
+@param[out]	flags		Pointer to a 4 byte Table Flags
+@param[in]	format		Row format
+@param[in]	zip_ssize	Zip Shift Size
+@param[in]	use_data_dir	Table uses DATA DIRECTORY
+@param[in]	page_compressed Table uses page compression
+@param[in]	page_compression_level Page compression level */
+UNIV_INLINE
+void
+dict_tf_set(
+	ulint*		flags,
+	rec_format_t	format,
+	ulint		zip_ssize,
+	bool		use_data_dir,
+	bool		page_compressed,
+	ulint		page_compression_level)
+{
+	const ulint	atomic_blobs = 1 << DICT_TF_POS_ATOMIC_BLOBS;
+	*flags = use_data_dir ? 1 << DICT_TF_POS_DATA_DIR : 0;
+
+	switch (format) {
+	case REC_FORMAT_REDUNDANT:
+		/* no other options are allowed */
+		ut_ad(zip_ssize == 0);
+		ut_ad(!page_compressed);
+		return;
+	case REC_FORMAT_COMPACT:
+		ut_ad(zip_ssize == 0);
+		*flags |= DICT_TF_COMPACT;
+		break;
+	case REC_FORMAT_COMPRESSED:
+		*flags |= DICT_TF_COMPACT | atomic_blobs;
+		*flags |= zip_ssize << DICT_TF_POS_ZIP_SSIZE;
+		break;
+	case REC_FORMAT_DYNAMIC:
+		ut_ad(zip_ssize == 0);
+		*flags |= DICT_TF_COMPACT | atomic_blobs;
+		break;
+	}
+
+	if (!page_compressed) {
+		return;
+	}
+
+	ut_ad(zip_ssize == 0);
+	*flags |= atomic_blobs
+		| (1 << DICT_TF_POS_PAGE_COMPRESSION)
+		| (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL);
+	ut_ad(dict_tf_get_page_compression(*flags) == TRUE);
+	ut_ad(dict_tf_get_page_compression_level(*flags) == page_compression_level);
+}
+
+/** Convert a 32 bit integer table flags to the 32 bit FSP Flags.
+Fsp Flags are written into the tablespace header at the offset
+FSP_SPACE_FLAGS and are also stored in the fil_space_t::flags field.
+The following chart shows the translation of the low order bit when
+the full_crc32 branch is not taken (the else branch below).
+========================= Low order bit ==========================
+                    | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC
+dict_table_t::flags |     0     |    1    |     1      |    1
+fil_space_t::flags  |     0     |    0    |     1      |    1
+==================================================================
+@param[in]	table_flags	dict_table_t::flags
+@return tablespace flags (fil_space_t::flags) */
+inline uint32_t dict_tf_to_fsp_flags(unsigned table_flags)
+{
+	uint32_t fsp_flags;
+	uint32_t page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(
+		table_flags);
+	/* PAGE_COMPRESSION must be set exactly when the level is nonzero */
+	ut_ad((DICT_TF_GET_PAGE_COMPRESSION(table_flags) == 0)
+	      == (page_compression_level == 0));
+	/* DBUG injection point: return UINT32_MAX when it is enabled */
+	DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure", return UINT32_MAX;);
+
+	/* No ROW_FORMAT=COMPRESSED for innodb_checksum_algorithm=full_crc32 */
+	if ((srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32
+	     || srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_FULL_CRC32)
+	    && !(table_flags & DICT_TF_MASK_ZIP_SSIZE)) {
+
+		fsp_flags = 1U << FSP_FLAGS_FCRC32_POS_MARKER
+			| FSP_FLAGS_FCRC32_PAGE_SSIZE();
+
+		if (page_compression_level) {
+			fsp_flags |= static_cast<uint32_t>(
+				innodb_compression_algorithm)
+				<< FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO;
+		}
+	} else {
+		/* Adjust bit zero. */
+		fsp_flags = DICT_TF_HAS_ATOMIC_BLOBS(table_flags) ? 1 : 0;
+
+		/* ZIP_SSIZE and ATOMIC_BLOBS are at the same position. */
+		fsp_flags |= table_flags
+			& (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS);
+
+		fsp_flags |= FSP_FLAGS_PAGE_SSIZE();
+
+		if (page_compression_level) {
+			fsp_flags |= FSP_FLAGS_MASK_PAGE_COMPRESSION;
+		}
+	}
+	/* Validate before the FSP_FLAGS_MEM_ bits are added below */
+	ut_a(fil_space_t::is_valid_flags(fsp_flags, false));
+
+	if (DICT_TF_HAS_DATA_DIR(table_flags)) {
+		fsp_flags |= 1U << FSP_FLAGS_MEM_DATA_DIR;
+	}
+
+	fsp_flags |= page_compression_level << FSP_FLAGS_MEM_COMPRESSION_LEVEL;
+
+	return(fsp_flags);
+}
+
+/** Convert a 32 bit integer table flags to the 32bit integer that is
+written to a SYS_TABLES.TYPE field. The following chart shows the
+translation of the low order bit.  Other bits are the same.
+========================= Low order bit ==========================
+                    | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
+dict_table_t::flags |     0     |    1    |     1
+SYS_TABLES.TYPE     |     1     |    1    |     1
+==================================================================
+The flags must be valid; this is checked by
+ut_a(dict_tf_is_valid(flags)).
+@param[in]	flags	dict_table_t::flags
+@return ulint containing SYS_TABLES.TYPE */
+UNIV_INLINE
+ulint
+dict_tf_to_sys_tables_type(
+	ulint	flags)
+{
+	ut_a(dict_tf_is_valid(flags));
+
+	/* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION,
+	PAGE_COMPRESSION_LEVEL and NO_ROLLBACK are copied as they are. */
+	const ulint	copied_mask
+		= DICT_TF_MASK_ZIP_SSIZE
+		| DICT_TF_MASK_ATOMIC_BLOBS
+		| DICT_TF_MASK_DATA_DIR
+		| DICT_TF_MASK_PAGE_COMPRESSION
+		| DICT_TF_MASK_PAGE_COMPRESSION_LEVEL
+		| DICT_TF_MASK_NO_ROLLBACK;
+
+	/* Bit zero is always 1 in SYS_TABLES.TYPE */
+	const ulint	type = (flags & copied_mask) | 1;
+
+	return type;
+}
+
+/** Get the number of fields in the internal representation of an
+index, including fields added by the dictionary system.
+@param[in]	index	an internal representation of index (in the
+			dictionary cache)
+@return number of fields (index->n_fields) */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_fields(
+	const dict_index_t*	index)
+{
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	const uint16_t	n_fields = index->n_fields;
+	return n_fields;
+}
+
+/** Get the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account. In the B-tree, use the
+value returned by dict_index_get_n_unique_in_tree() instead.
+@param[in]	index	an internal representation of index (in the
+			dictionary cache); must be cached
+@return number of fields (index->n_uniq) */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique(
+	const dict_index_t*	index)
+{
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+	ut_ad(index->cached);
+	const uint16_t	n_uniq = index->n_uniq;
+	return n_uniq;
+}
+
+/** Get the number of fields in the internal representation of an index
+which uniquely determine the position of an index entry in the index, if
+we also take multiversioning into account.
+@param[in]	index	an internal representation of index (in the
+			dictionary cache); must be cached
+@return dict_index_get_n_unique() for a clustered index,
+dict_index_get_n_fields() for any other index */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique_in_tree(
+	const dict_index_t*	index)
+{
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+	ut_ad(index->cached);
+	uint16_t	n;
+	if (dict_index_is_clust(index)) {
+		n = dict_index_get_n_unique(index);
+	} else {
+		n = dict_index_get_n_fields(index);
+	}
+	return n;
+}
+
+/** Get the number of fields on non-leaf page level in the internal
+representation of an index which uniquely determine the position of an
+index entry in the index, if we also take multiversioning into account.
+The page number field is not included in the count.
+@param[in]	index	index; must be cached
+@return DICT_INDEX_SPATIAL_NODEPTR_SIZE for a spatial index,
+dict_index_get_n_unique_in_tree() for any other index */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique_in_tree_nonleaf(
+	const dict_index_t*	index)
+{
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+	ut_ad(index->cached);
+
+	if (!dict_index_is_spatial(index)) {
+		return dict_index_get_n_unique_in_tree(index);
+	}
+
+	/* On a non-leaf page of a spatial index there are only
+	2 fields (mbr+page_no). Without the page number field,
+	that leaves one field. */
+	return DICT_INDEX_SPATIAL_NODEPTR_SIZE;
+}
+
+/** Get the number of user-defined ordering fields in the index. In the
+internal representation of clustered indexes we add the row id to the
+ordering fields to make a clustered index unique, but this function
+returns the number of fields the user defined as ordering fields.
+@param[in]	index	an internal representation of index (in the
+			dictionary cache)
+@return number of fields (index->n_user_defined_cols) */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_ordering_defined_by_user(
+	const dict_index_t*	index)
+{
+	const uint16_t	n_user = index->n_user_defined_cols;
+	return n_user;
+}
+
+#ifdef UNIV_DEBUG
+/** Get the nth field of an index (debug builds only).
+@param[in]	index	index
+@param[in]	pos	position of field; must be less than index->n_def
+@return pointer to field object */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+	const dict_index_t*	index,
+	ulint			pos)
+{
+	ut_ad(pos < index->n_def);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+	dict_field_t*	first = (dict_field_t*) (index->fields);
+	return &first[pos];
+}
+#endif /* UNIV_DEBUG */
+
+/** Get the table column that an index field refers to.
+@param[in]	field	index field
+@return field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+	const dict_field_t*	field)
+{
+	const dict_col_t*	col = field->col;
+	return col;
+}
+
+/** Get a pointer to the column of the nth field in an index.
+@param[in]	index	index
+@param[in]	pos	position of the field
+@return column */
+UNIV_INLINE
+const dict_col_t*
+dict_index_get_nth_col(
+	const dict_index_t*	index,
+	ulint			pos)
+{
+	return dict_index_get_nth_field(index, pos)->col;
+}
+
+/** Get the table column number of the nth field in an index.
+@param[in]	index	index
+@param[in]	pos	position of the field
+@return column number */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+	const dict_index_t*	index,
+	ulint			pos)
+{
+	return dict_col_get_no(dict_index_get_nth_col(index, pos));
+}
+
+/** Look for column n in an index.
+@param[in]	index		index
+@param[in]	n		column number
+@param[out]	prefix_col_pos	col num if prefix
+@return position in the index representation, or ULINT_UNDEFINED */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_pos(
+	const dict_index_t*	index,
+	ulint			n,
+	ulint*			prefix_col_pos)
+{
+	return dict_index_get_nth_col_or_prefix_pos(
+		index, n, false, false, prefix_col_pos);
+}
+
+/** Compute the minimum data size of an index record, as the sum of
+dict_col_get_min_size() over the columns of all fields of the index.
+@param[in]	index	index
+@return minimum data size in bytes */
+UNIV_INLINE
+unsigned
+dict_index_get_min_size(
+	const dict_index_t*	index)
+{
+	const unsigned	n_fields = dict_index_get_n_fields(index);
+	unsigned	total = 0;
+	for (unsigned i = n_fields; i > 0; i--) {
+		total += dict_col_get_min_size(
+			dict_index_get_nth_col(index, i - 1));
+	}
+	return total;
+}
+
+/** Get the page number of the root of the index tree.
+@param[in]	index	index
+@return page number (index->page) */
+UNIV_INLINE
+uint32_t
+dict_index_get_page(
+	const dict_index_t*	index)
+{
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	const uint32_t	root_page_no = index->page;
+	return root_page_no;
+}
+
+/** Return the free space reserved for future updates of records, which
+is 1/16 of srv_page_size. This is relevant only in the case of many
+consecutive inserts, as updates which make the records bigger might
+fragment the index.
+@return number of free bytes on page, reserved for updates */
+UNIV_INLINE
+ulint
+dict_index_get_space_reserve(void)
+{
+	const ulint	reserve = srv_page_size / 16;
+	return reserve;
+}
+
+/** Get the status of online index creation. The value may change
+concurrently unless index->lock is held (see the comment in the body).
+@param[in]	index	index
+@return the status */
+UNIV_INLINE
+enum online_index_status
+dict_index_get_online_status(
+	const dict_index_t*	index)
+{
+	enum online_index_status	status;
+
+	status = (enum online_index_status) index->online_status;
+
+	/* Without the index->lock protection, the online
+	status can change from ONLINE_INDEX_CREATION to
+	ONLINE_INDEX_COMPLETE (or ONLINE_INDEX_ABORTED) in
+	row_log_apply() once log application is done. So to make
+	sure the status is ONLINE_INDEX_CREATION or ONLINE_INDEX_COMPLETE
+	you should always do the recheck after acquiring index->lock */
+
+#ifdef UNIV_DEBUG
+	switch (status) {
+	case ONLINE_INDEX_COMPLETE:
+	case ONLINE_INDEX_CREATION:
+	case ONLINE_INDEX_ABORTED:
+	case ONLINE_INDEX_ABORTED_DROPPED:
+		return(status);
+	}
+	ut_error;
+#endif /* UNIV_DEBUG */
+	return(status);
+}
+
+/** Set the status of online index creation.
+@param[in,out]	index	index; not DICT_FTS, index->lock.have_x() must hold
+@param[in]	status	status; only the low 2 bits are stored */
+UNIV_INLINE
+void
+dict_index_set_online_status(
+	dict_index_t*			index,
+	enum online_index_status	status)
+{
+	ut_ad(!(index->type & DICT_FTS));
+	ut_ad(index->lock.have_x());
+	/* Debug check of the transition from the current status */
+#ifdef UNIV_DEBUG
+	switch (dict_index_get_online_status(index)) {
+	case ONLINE_INDEX_COMPLETE:
+	case ONLINE_INDEX_CREATION:
+		break;
+	case ONLINE_INDEX_ABORTED:
+		ut_ad(status == ONLINE_INDEX_ABORTED_DROPPED);
+		break;
+	case ONLINE_INDEX_ABORTED_DROPPED:
+		ut_error;
+	}
+#endif /* UNIV_DEBUG */
+
+	index->online_status = status & 3;
+	ut_ad(dict_index_get_online_status(index) == status);
+}
+
+/** Determine if a secondary index is being or has been created online,
+or if the table is being rebuilt online, allowing concurrent modifications
+to the table. Outside of the debug-only branch, the result is whether
+the online status differs from ONLINE_INDEX_COMPLETE.
+@param[in]	index	index
+@retval true if the index is being or has been built online, or
+if this is a clustered index and the table is being or has been rebuilt online
+@retval false if the index has been created or the table has been
+rebuilt completely */
+UNIV_INLINE
+bool
+dict_index_is_online_ddl(
+	const dict_index_t*	index)
+{
+#ifdef UNIV_DEBUG
+	if (dict_index_is_clust(index)) {
+		switch (dict_index_get_online_status(index)) {
+		case ONLINE_INDEX_CREATION:
+			return(true);
+		case ONLINE_INDEX_COMPLETE:
+			return(false);
+		case ONLINE_INDEX_ABORTED:
+		case ONLINE_INDEX_ABORTED_DROPPED:
+			break;
+		}
+		ut_ad(0);
+		return(false);
+	}
+#endif /* UNIV_DEBUG */
+
+	return(UNIV_UNLIKELY(dict_index_get_online_status(index)
+			     != ONLINE_INDEX_COMPLETE));
+}
+
+/** Check whether a column exists in an FTS index.
+@param[in]	indexes		vector containing only FTS indexes
+@param[in]	col_no		col number to search for
+@param[in]	is_virtual	whether it is a virtual column
+@return offset within the vector of the first index for which
+contains_col_or_prefix(col_no, is_virtual) holds
+@retval ULINT_UNDEFINED if no index matches */
+UNIV_INLINE
+ulint
+dict_table_is_fts_column(
+	ib_vector_t*	indexes,
+	ulint		col_no,
+	bool		is_virtual)
+{
+	for (ulint pos = 0; pos < ib_vector_size(indexes); ++pos) {
+		dict_index_t*	fts_index
+			= (dict_index_t*) ib_vector_getp(indexes, pos);
+
+		if (fts_index->contains_col_or_prefix(col_no, is_virtual)) {
+			return pos;
+		}
+	}
+
+	/* No index in the vector matched. */
+	return ULINT_UNDEFINED;
+}
+
+/** Determine bytes of column prefix to be stored in the undo log.
+@param[in]	table	table
+@param[in]	col	column which index prefix is based on
+@return bytes of column prefix to be stored in the undo log:
+0 if !dict_table_has_atomic_blobs(table), else col->max_prefix if
+that is nonzero, else REC_VERSION_56_MAX_INDEX_COL_LEN */
+UNIV_INLINE
+ulint
+dict_max_field_len_store_undo(
+	dict_table_t*		table,
+	const dict_col_t*	col)
+{
+	ulint	prefix_len;
+
+	if (!dict_table_has_atomic_blobs(table)) {
+		prefix_len = 0;
+	} else if (col->max_prefix != 0) {
+		prefix_len = col->max_prefix;
+	} else {
+		prefix_len = REC_VERSION_56_MAX_INDEX_COL_LEN;
+	}
+	return prefix_len;
+}
+
+/** Determine the maximum number of bytes of a virtual column that need
+to be stored in the undo log. The limit depends on whether the table
+has atomic BLOBs and, if so, on the max_prefix of the column.
+@param[in]	table		dict_table_t for the table
+@param[in]	col_no		virtual column number
+@return maximum bytes of virtual column to be stored in the undo log */
+UNIV_INLINE
+ulint
+dict_max_v_field_len_store_undo(
+	dict_table_t*		table,
+	ulint			col_no)
+{
+	const dict_v_col_t*	v_col
+		= dict_table_get_nth_v_col(table, col_no);
+	const dict_col_t*	col = &v_col->m_col;
+
+	/* This calculation conforms to the non-virtual column
+	maximum log length calculation:
+	1) without atomic BLOBs, REC_ANTELOPE_MAX_INDEX_COL_LEN
+	2) with atomic BLOBs, col->max_prefix for a DATA_BIG_COL
+	column that has a nonzero max_prefix, otherwise
+	DICT_MAX_FIELD_LEN_BY_FORMAT(table) */
+	if (!dict_table_has_atomic_blobs(table)) {
+		return REC_ANTELOPE_MAX_INDEX_COL_LEN;
+	}
+
+	if (DATA_BIG_COL(col) && col->max_prefix > 0) {
+		return col->max_prefix;
+	}
+
+	return DICT_MAX_FIELD_LEN_BY_FORMAT(table);
+}
+
+/** Check whether the table resides in a file-per-table tablespace.
+This test does not use table flags2 since some REDUNDANT tables in the
+system tablespace may have garbage in the MIX_LEN field where flags2 is
+stored. These garbage MIX_LEN fields were written before v3.23.52.
+A patch was added to v3.23.52 which initializes the MIX_LEN field to 0.
+Since file-per-table tablespaces were added in 4.1, any SYS_TABLES
+record with a non-zero space ID will have a reliable MIX_LEN field.
+However, this test does not use flags2 from SYS_TABLES.MIX_LEN. Instead,
+it assumes that a tablespace which is neither the system tablespace nor
+the temporary tablespace must be file-per-table.
+Also, during ALTER TABLE, the DICT_TF2_USE_FILE_PER_TABLE flag may not be
+set on one of the file-per-table tablespaces.
+This test cannot be done on a table in the process of being created
+because the space_id will be zero until the tablespace is created.
+@param[in]	table	An existing open table to check
+@return true if this table was created as a file-per-table tablespace. */
+UNIV_INLINE
+bool
+dict_table_is_file_per_table(
+	const dict_table_t*	table)	/*!< in: table to check */
+{
+	return !(table->space == fil_system.sys_space
+		 || table->space == fil_system.temp_space);
+}
+
+/** Acquire the table handle by taking one more reference to the table. */
+inline void dict_table_t::acquire()
+{
+  ut_ad(dict_sys.frozen()); /* the caller is expected to have dict_sys frozen */
+  n_ref_count++;
+}
+
+/** Release the table handle by dropping one reference to the table.
+@return	whether the last handle was released */
+inline
+bool
+dict_table_t::release()
+{
+	auto n = n_ref_count--;	/* n is the count before the decrement */
+	ut_ad(n > 0);		/* there must have been a handle to release */
+	return n == 1;		/* the handle just released was the last */
+}
+
+/** Pack the counts of non-virtual and virtual columns into one 4-byte
+value. This is possible because InnoDB limits a table to 1017 columns.
+@param[in]	n_col	number of non-virtual columns
+@param[in]	n_v_col	number of virtual columns
+@return n_col plus n_v_col shifted left by 16 bits */
+UNIV_INLINE
+ulint
+dict_table_encode_n_col(
+	ulint	n_col,
+	ulint	n_v_col)
+{
+	const ulint	v_part = n_v_col << 16;
+	return(v_part + n_col);
+}
+
+/** Decode the numbers of virtual and non-virtual columns from one 4-byte
+value. The DICT_N_COLS_COMPACT flag bit is masked off before decoding.
+@param[in]	encoded	encoded value
+@param[out]	n_col	number of non-virtual columns
+@param[out]	n_v_col	number of virtual columns */
+UNIV_INLINE
+void
+dict_table_decode_n_col(
+	ulint	encoded,
+	ulint*	n_col,
+	ulint*	n_v_col)
+{
+	const ulint	counts = encoded & ~DICT_N_COLS_COMPACT;
+	*n_v_col = counts >> 16;
+	*n_col = counts & ((1U << 16) - 1);
+}
+
+/** Free the virtual column template (default_rec, every non-NULL vtempl
+entry and vtempl itself). Inline, as this definition lives in a header.
+@param[in,out]	vc_templ	virtual column template */
+UNIV_INLINE
+void
+dict_free_vc_templ(dict_vcol_templ_t* vc_templ)
+{
+	UT_DELETE_ARRAY(vc_templ->default_rec);
+	vc_templ->default_rec = NULL;
+	if (vc_templ->vtempl != NULL) {
+		ut_ad(vc_templ->n_v_col > 0);
+		for (ulint i = 0; i < vc_templ->n_col
+		     + vc_templ->n_v_col; i++) {
+			if (vc_templ->vtempl[i] != NULL) {
+				ut_free(vc_templ->vtempl[i]);
+			}
+		}
+		ut_free(vc_templ->vtempl);
+		vc_templ->vtempl = NULL;
+	}
+}
+
+/** Check whether any virtual column of the table is part of an index.
+@param[in]	table	InnoDB table
+@return true if some virtual column has ord_part set, false otherwise. */
+UNIV_INLINE
+bool
+dict_table_have_virtual_index(
+	dict_table_t*	table)
+{
+	bool	indexed = false;
+	ulint	v_pos = 0;
+
+	/* Stop at the first virtual column used as an ordering field. */
+	while (!indexed && v_pos < dict_table_get_n_v_cols(table)) {
+		indexed = dict_table_get_nth_v_col(
+			table, v_pos)->m_col.ord_part;
+		v_pos++;
+	}
+
+	return(indexed);
+}
diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h
new file mode 100644
index 00000000..f7d33d5b
--- /dev/null
+++ b/storage/innobase/include/dict0load.h
@@ -0,0 +1,220 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0load.h
+Loads to the memory cache database object definitions
+from dictionary tables
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0load_h
+#define dict0load_h
+
+#include "dict0types.h"
+#include "trx0types.h"
+#include "ut0byte.h"
+#include "mem0mem.h"
+#include "btr0types.h"
+
+#include <deque>
+
+/** A stack of table names related through foreign key constraints */
+typedef std::deque<const char*, ut_allocator<const char*> >	dict_names_t;
+
+/** Check each tablespace found in the data dictionary.
+Then look at each table defined in SYS_TABLES that has a space_id > 0
+to find all the file-per-table tablespaces.
+
+In a crash recovery we already have some tablespace objects created from
+processing the REDO log. We will compare the
+space_id information in the data dictionary to what we find in the
+tablespace file. In addition, more validation will be done if recovery
+was needed and force_recovery is not set.
+
+We also scan the biggest space id, and store it to fil_system. */
+void dict_check_tablespaces_and_store_max_id();
+
+/** Make sure the data_file_name is saved in dict_table_t if needed.
+@param[in,out]	table		Table object */
+void dict_get_and_save_data_dir_path(dict_table_t* table);
+
+/***********************************************************************//**
+Loads a table object based on the table id.
+@return table; NULL if table does not exist */
+dict_table_t*
+dict_load_table_on_id(
+/*==================*/
+	table_id_t		table_id,	/*!< in: table id */
+	dict_err_ignore_t	ignore_err);	/*!< in: errors to ignore
+						when loading the table */
+/********************************************************************//**
+This function is called when the database is booted.
+Loads system table index definitions except for the clustered index which
+is added to the dictionary cache at booting before calling this function. */
+void
+dict_load_sys_table(
+/*================*/
+	dict_table_t*	table);	/*!< in: system table */
+/***********************************************************************//**
+Loads foreign key constraints where the table is either the foreign key
+holder or where the table is referenced by a foreign key. Adds these
+constraints to the data dictionary.
+
+The foreign key constraint is loaded only if the referenced table is also
+in the dictionary cache.  If the referenced table is not in dictionary
+cache, then it is added to the output parameter (fk_tables).
+
+@return DB_SUCCESS or error code */
+dberr_t
+dict_load_foreigns(
+/*===============*/
+	const char*		table_name,	/*!< in: table name */
+	const char**		col_names,	/*!< in: column names, or NULL
+						to use table->col_names */
+	trx_id_t		trx_id,		/*!< in: DDL transaction id,
+						or 0 to check
+						recursive load of tables
+						chained by FK */
+	bool			check_charsets,	/*!< in: whether to check
+						charset compatibility */
+	dict_err_ignore_t	ignore_err,	/*!< in: error to be ignored */
+	dict_names_t&		fk_tables)	/*!< out: stack of table names
+						which must be loaded
+						subsequently to load all the
+						foreign key constraints. */
+	MY_ATTRIBUTE((nonnull(1)));
+
+/********************************************************************//**
+This function opens a system table and returns the first record.
+@return first record of the system table */
+const rec_t*
+dict_startscan_system(
+/*==================*/
+	btr_pcur_t*	pcur,		/*!< out: persistent cursor to
+					the record */
+	mtr_t*		mtr,		/*!< in: the mini-transaction */
+	dict_table_t*	table);		/*!< in: system table */
+/********************************************************************//**
+This function gets the next system table record as we scan the table.
+@return the record if found, NULL if end of scan. */
+const rec_t*
+dict_getnext_system(
+/*================*/
+	btr_pcur_t*	pcur,		/*!< in/out: persistent cursor
+					to the record */
+	mtr_t*		mtr);		/*!< in: the mini-transaction */
+
+/** Load a table definition from a SYS_TABLES record to dict_table_t.
+Do not load any columns or indexes.
+@param[in,out]	mtr		mini-transaction
+@param[in]	uncommitted	whether to use READ UNCOMMITTED isolation level
+@param[in]	rec		SYS_TABLES record
+@param[out,own]	table		table, or nullptr
+@return	error message
+@retval	nullptr on success */
+const char *dict_load_table_low(mtr_t *mtr, bool uncommitted,
+                                const rec_t *rec, dict_table_t **table)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/********************************************************************//**
+This function parses a SYS_INDEXES record and populates a dict_index_t
+structure with the information from the record. For detailed information
+about SYS_INDEXES fields, please refer to the dict_boot() function.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_indexes_rec(
+/*=========================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_INDEXES rec */
+	dict_index_t*	index,		/*!< out: dict_index_t to be
+					filled */
+	table_id_t*	table_id);	/*!< out: table id */
+/********************************************************************//**
+This function parses a SYS_COLUMNS record and populates a dict_col_t
+structure with the information from the record.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_columns_rec(
+/*=========================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_COLUMNS rec */
+	dict_col_t*	column,		/*!< out: dict_col_t to be filled */
+	table_id_t*	table_id,	/*!< out: table id */
+	const char**	col_name,	/*!< out: column name */
+	ulint*		nth_v_col);	/*!< out: if virtual col, this
+					records its sequence number */
+
+/** This function parses a SYS_VIRTUAL record and extracts the virtual
+column information (table id, virtual column position and base column
+position) from it.
+@param[in]	rec		current SYS_VIRTUAL rec
+@param[in,out]	table_id	table id
+@param[in,out]	pos		virtual column position
+@param[in,out]	base_pos	base column position
+@return error message, or NULL on success */
+const char*
+dict_process_sys_virtual_rec(
+	const rec_t*	rec,
+	table_id_t*	table_id,
+	ulint*		pos,
+	ulint*		base_pos);
+/********************************************************************//**
+This function parses a SYS_FIELDS record and populates a dict_field_t
+structure with the information from the record.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_fields_rec(
+/*========================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_FIELDS rec */
+	dict_field_t*	sys_field,	/*!< out: dict_field_t to be
+					filled */
+	ulint*		pos,		/*!< out: Field position */
+	index_id_t*	index_id,	/*!< out: current index id */
+	index_id_t	last_id);	/*!< in: previous index id */
+/********************************************************************//**
+This function parses a SYS_FOREIGN record and populates a dict_foreign_t
+structure with the information from the record. For detailed information
+about SYS_FOREIGN fields, please refer to the dict_load_foreign() function.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_foreign_rec(
+/*=========================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_FOREIGN rec */
+	dict_foreign_t*	foreign);	/*!< out: dict_foreign_t to be
+					filled */
+/********************************************************************//**
+This function parses a SYS_FOREIGN_COLS record, extracts the necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_foreign_col_rec(
+/*=============================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_FOREIGN_COLS rec */
+	const char**	name,		/*!< out: foreign key constraint name */
+	const char**	for_col_name,	/*!< out: referencing column name */
+	const char**	ref_col_name,	/*!< out: referenced column name
+					in referenced table */
+	ulint*		pos);		/*!< out: column position */
+
+#endif
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
new file mode 100644
index 00000000..fde2a714
--- /dev/null
+++ b/storage/innobase/include/dict0mem.h
@@ -0,0 +1,2649 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0mem.h
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0mem_h
+#define dict0mem_h
+
+#include "dict0types.h"
+#include "data0type.h"
+#include "mem0mem.h"
+#include "row0types.h"
+#include "btr0types.h"
+#include "lock0types.h"
+#include "que0types.h"
+#include "sux_lock.h"
+#include "ut0mem.h"
+#include "ut0rnd.h"
+#include "ut0byte.h"
+#include "hash0hash.h"
+#include "trx0types.h"
+#include "fts0fts.h"
+#include "buf0buf.h"
+#include "mtr0mtr.h"
+#include "gis0type.h"
+#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "mysql_com.h"
+#include <sql_const.h>
+#include <set>
+#include <algorithm>
+#include <iterator>
+#include <ostream>
+#include <mutex>
+
+/* Forward declaration. */
+struct ib_rbt_t;
+
+/** Type flags of an index: OR'ing of the flags is allowed to define a
+combination of types */
+/* @{ */
+#define DICT_CLUSTERED	1	/*!< clustered index; for other than
+				auto-generated clustered indexes,
+				also DICT_UNIQUE will be set */
+#define DICT_UNIQUE	2	/*!< unique index */
+#define	DICT_IBUF	8	/*!< insert buffer tree */
+#define	DICT_CORRUPT	16	/*!< bit to store the corrupted flag
+				in SYS_INDEXES.TYPE */
+#define	DICT_FTS	32	/* FTS index; can't be combined with the
+				other flags */
+#define	DICT_SPATIAL	64	/* SPATIAL index; can't be combined with the
+				other flags */
+#define	DICT_VIRTUAL	128	/* Index on Virtual column */
+
+#define	DICT_IT_BITS	8	/*!< number of bits used for
+				SYS_INDEXES.TYPE */
+/* @} */
+
+#if 0 /* not implemented, retained for history */
+/** Types for a table object */
+#define DICT_TABLE_ORDINARY		1 /*!< ordinary table */
+#define	DICT_TABLE_CLUSTER_MEMBER	2
+#define	DICT_TABLE_CLUSTER		3 /* this means that the table is
+					  really a cluster definition */
+#endif
+
+/* Table and tablespace flags are generally not used for the Antelope file
+format except for the low order bit, which is used differently depending on
+where the flags are stored.
+
+==================== Low order flags bit =========================
+                    | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
+SYS_TABLES.TYPE     |     1     |    1    |     1
+dict_table_t::flags |     0     |    1    |     1
+FSP_SPACE_FLAGS     |     0     |    0    |     1
+fil_space_t::flags  |     0     |    0    |     1
+
+Before the 5.1 plugin, SYS_TABLES.TYPE was always DICT_TABLE_ORDINARY (1)
+and the tablespace flags field was always 0. In the 5.1 plugin, these fields
+were repurposed to identify compressed and dynamic row formats.
+
+The following types and constants describe the flags found in dict_table_t
+and SYS_TABLES.TYPE.  Similar flags found in fil_space_t and FSP_SPACE_FLAGS
+are described in fsp0fsp.h. */
+
+/* @{ */
+/** dict_table_t::flags bit 0 is equal to 0 if the row format = Redundant */
+#define DICT_TF_REDUNDANT		0	/*!< Redundant row format. */
+/** dict_table_t::flags bit 0 is equal to 1 if the row format = Compact */
+#define DICT_TF_COMPACT			1U	/*!< Compact row format. */
+
+/** This bitmask is used in SYS_TABLES.N_COLS to set and test whether
+the Compact page format is used, i.e ROW_FORMAT != REDUNDANT */
+constexpr uint32_t DICT_N_COLS_COMPACT= 1U << 31;
+
+/** Width of the COMPACT flag */
+#define DICT_TF_WIDTH_COMPACT		1
+
+/** Width of the ZIP_SSIZE flag */
+#define DICT_TF_WIDTH_ZIP_SSIZE		4
+
+/** Width of the ATOMIC_BLOBS flag.  The ROW_FORMAT=REDUNDANT and
+ROW_FORMAT=COMPACT broke up BLOB and TEXT fields, storing the first 768 bytes
+in the clustered index. ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED
+store the whole blob or text field off-page atomically.
+Secondary indexes are created from this external data using row_ext_t
+to cache the BLOB prefixes. */
+#define DICT_TF_WIDTH_ATOMIC_BLOBS	1
+
+/** If a table is created with the MYSQL option DATA DIRECTORY and
+innodb-file-per-table, an older engine will not be able to find that table.
+This flag prevents older engines from attempting to open the table and
+allows InnoDB to update_create_info() accordingly. */
+#define DICT_TF_WIDTH_DATA_DIR		1
+
+/**
+Width of the page compression flag
+*/
+#define DICT_TF_WIDTH_PAGE_COMPRESSION  1
+#define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4
+
+/**
+The NO_ROLLBACK flag (3=yes; the values 1,2 used stand for
+ATOMIC_WRITES=ON and ATOMIC_WRITES=OFF between MariaDB 10.1.0 and 10.2.3)
+*/
+#define DICT_TF_WIDTH_NO_ROLLBACK 2
+
+/** Width of all the currently known table flags */
+#define DICT_TF_BITS	(DICT_TF_WIDTH_COMPACT			\
+			+ DICT_TF_WIDTH_ZIP_SSIZE		\
+			+ DICT_TF_WIDTH_ATOMIC_BLOBS		\
+			+ DICT_TF_WIDTH_DATA_DIR		\
+			+ DICT_TF_WIDTH_PAGE_COMPRESSION	\
+			+ DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL	\
+			+ DICT_TF_WIDTH_NO_ROLLBACK)
+
+/** Zero relative shift position of the COMPACT field */
+#define DICT_TF_POS_COMPACT		0
+/** Zero relative shift position of the ZIP_SSIZE field */
+#define DICT_TF_POS_ZIP_SSIZE		(DICT_TF_POS_COMPACT		\
+					+ DICT_TF_WIDTH_COMPACT)
+/** Zero relative shift position of the ATOMIC_BLOBS field */
+#define DICT_TF_POS_ATOMIC_BLOBS	(DICT_TF_POS_ZIP_SSIZE		\
+					+ DICT_TF_WIDTH_ZIP_SSIZE)
+/** Zero relative shift position of the DATA_DIR field */
+#define DICT_TF_POS_DATA_DIR		(DICT_TF_POS_ATOMIC_BLOBS	\
+					+ DICT_TF_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the PAGE_COMPRESSION field */
+#define DICT_TF_POS_PAGE_COMPRESSION	(DICT_TF_POS_DATA_DIR		\
+					+ DICT_TF_WIDTH_DATA_DIR)
+/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_POS_PAGE_COMPRESSION_LEVEL	(DICT_TF_POS_PAGE_COMPRESSION	\
+					+ DICT_TF_WIDTH_PAGE_COMPRESSION)
+/** Zero relative shift position of the NO_ROLLBACK field */
+#define DICT_TF_POS_NO_ROLLBACK		(DICT_TF_POS_PAGE_COMPRESSION_LEVEL \
+					+ DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)
+#define DICT_TF_POS_UNUSED		(DICT_TF_POS_NO_ROLLBACK     \
+					+ DICT_TF_WIDTH_NO_ROLLBACK)
+
+/** Bit mask of the COMPACT field */
+#define DICT_TF_MASK_COMPACT				\
+		((~(~0U << DICT_TF_WIDTH_COMPACT))	\
+		<< DICT_TF_POS_COMPACT)
+/** Bit mask of the ZIP_SSIZE field */
+#define DICT_TF_MASK_ZIP_SSIZE				\
+		((~(~0U << DICT_TF_WIDTH_ZIP_SSIZE))	\
+		<< DICT_TF_POS_ZIP_SSIZE)
+/** Bit mask of the ATOMIC_BLOBS field */
+#define DICT_TF_MASK_ATOMIC_BLOBS			\
+		((~(~0U << DICT_TF_WIDTH_ATOMIC_BLOBS))	\
+		<< DICT_TF_POS_ATOMIC_BLOBS)
+/** Bit mask of the DATA_DIR field */
+#define DICT_TF_MASK_DATA_DIR				\
+		((~(~0U << DICT_TF_WIDTH_DATA_DIR))	\
+		<< DICT_TF_POS_DATA_DIR)
+/** Bit mask of the PAGE_COMPRESSION field */
+#define DICT_TF_MASK_PAGE_COMPRESSION			\
+		((~(~0U << DICT_TF_WIDTH_PAGE_COMPRESSION)) \
+		<< DICT_TF_POS_PAGE_COMPRESSION)
+/** Bit mask of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_MASK_PAGE_COMPRESSION_LEVEL		\
+		((~(~0U << DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)) \
+		<< DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
+/** Bit mask of the NO_ROLLBACK field */
+#define DICT_TF_MASK_NO_ROLLBACK		\
+		((~(~0U << DICT_TF_WIDTH_NO_ROLLBACK)) \
+		<< DICT_TF_POS_NO_ROLLBACK)
+
+/** Return the value of the COMPACT field of a table flags expression */
+#define DICT_TF_GET_COMPACT(flags)			\
+		(((flags) & DICT_TF_MASK_COMPACT)	\
+		>> DICT_TF_POS_COMPACT)
+/** Return the value of the ZIP_SSIZE field */
+#define DICT_TF_GET_ZIP_SSIZE(flags)			\
+		(((flags) & DICT_TF_MASK_ZIP_SSIZE)	\
+		>> DICT_TF_POS_ZIP_SSIZE)
+/** Return the value of the ATOMIC_BLOBS field */
+#define DICT_TF_HAS_ATOMIC_BLOBS(flags)			\
+		(((flags) & DICT_TF_MASK_ATOMIC_BLOBS)	\
+		>> DICT_TF_POS_ATOMIC_BLOBS)
+/** Return the value of the DATA_DIR field */
+#define DICT_TF_HAS_DATA_DIR(flags)			\
+		(((flags) & DICT_TF_MASK_DATA_DIR)	\
+		>> DICT_TF_POS_DATA_DIR)
+/** Return the value of the PAGE_COMPRESSION field */
+#define DICT_TF_GET_PAGE_COMPRESSION(flags)	       \
+		(((flags) & DICT_TF_MASK_PAGE_COMPRESSION) \
+		>> DICT_TF_POS_PAGE_COMPRESSION)
+/** Return the value of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags)       \
+		(((flags) & DICT_TF_MASK_PAGE_COMPRESSION_LEVEL)	\
+		>> DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
+
+/* @} */
+
+/** @brief Table Flags set number 2.
+
+These flags will be stored in SYS_TABLES.MIX_LEN.  All unused flags
+will be written as 0.  The column may contain garbage for tables
+created with old versions of InnoDB that only implemented
+ROW_FORMAT=REDUNDANT.  InnoDB engines do not check these flags
+for unknown bits in order to preserve backward compatibility. */
+/* @{ */
+/** Total number of bits in table->flags2. */
+#define DICT_TF2_BITS			7
+#define DICT_TF2_UNUSED_BIT_MASK	(~0U << DICT_TF2_BITS)
+#define DICT_TF2_BIT_MASK		~DICT_TF2_UNUSED_BIT_MASK
+
+/** TEMPORARY; TRUE for tables from CREATE TEMPORARY TABLE. */
+#define DICT_TF2_TEMPORARY		1U
+
+/** The table has an internal defined DOC ID column */
+#define DICT_TF2_FTS_HAS_DOC_ID		2U
+
+/** The table has an FTS index */
+#define DICT_TF2_FTS			4U
+
+/** Need to add Doc ID column for FTS index build.
+This is a transient bit for index build */
+#define DICT_TF2_FTS_ADD_DOC_ID		8U
+
+/** This bit is used during table creation to indicate that it will
+use its own tablespace instead of the system tablespace. */
+#define DICT_TF2_USE_FILE_PER_TABLE	16U
+
+/** Set when we discard/detach the tablespace */
+#define DICT_TF2_DISCARDED		32U
+
+/** This bit is set if all aux table names (both common tables and
+index tables) of a FTS table are in HEX format. */
+#define DICT_TF2_FTS_AUX_HEX_NAME	64U
+
+/* @} */
+
+/** Set the given DICT_TF2_ flag bit(s) in table->flags2 */
+#define DICT_TF2_FLAG_SET(table, flag)	((table)->flags2 |= (flag))
+/** Test the given flag bit(s) in table->flags2; nonzero if any is set */
+#define DICT_TF2_FLAG_IS_SET(table, flag)	((table)->flags2 & (flag))
+
+/** Clear the given flag bit(s); bits beyond DICT_TF2_BITS are cleared too */
+#define DICT_TF2_FLAG_UNSET(table, flag)	\
+	((table)->flags2 &= ~(flag) & ((1U << DICT_TF2_BITS) - 1))
+
+/** Tables can be chained together by foreign key constraints. When the
+parent table is first loaded, all of its descendants are loaded as well.
+This could result in recursive calls and eventually a stack overflow.
+DICT_FK_MAX_RECURSIVE_LOAD defines the maximum number of recursive loads;
+when it is exceeded, the child table will not be loaded. It will be loaded
+when the foreign key constraint check needs to be run. */
+#define DICT_FK_MAX_RECURSIVE_LOAD	20
+
+/** Similarly, when tables are chained together by foreign key constraints
+with an ON DELETE/UPDATE CASCADE clause, a delete from the parent table
+could result in recursive cascading calls. This defines the maximum number
+of such cascading deletes/updates allowed. When it is exceeded, the delete
+from the parent table will fail, and the user has to drop the excessive
+foreign key constraints before proceeding. */
+#define FK_MAX_CASCADE_DEL		15
+
+/****************************************************************/ /**
+ Free a table memory object. */
+void
+dict_mem_table_free(
+/*================*/
+	dict_table_t*	table);		/*!< in: table */
+/**********************************************************************//**
+Adds a column definition to a table. */
+void
+dict_mem_table_add_col(
+/*===================*/
+	dict_table_t*	table,	/*!< in: table */
+	mem_heap_t*	heap,	/*!< in: temporary memory heap, or NULL */
+	const char*	name,	/*!< in: column name, or NULL */
+	ulint		mtype,	/*!< in: main datatype */
+	ulint		prtype,	/*!< in: precise type */
+	ulint		len)	/*!< in: precision */
+	MY_ATTRIBUTE((nonnull(1)));
+/** Adds a virtual column definition to a table.
+@param[in,out]	table		table
+@param[in]	heap		temporary memory heap, or NULL. It is
+				used to store name when we have not finished
+				adding all columns. When all columns are
+				added, the whole name will copy to memory from
+				table->heap
+@param[in]	name		column name
+@param[in]	mtype		main datatype
+@param[in]	prtype		precise type
+@param[in]	len		length
+@param[in]	pos		position in a table
+@param[in]	num_base	number of base columns
+@return the virtual column definition */
+dict_v_col_t*
+dict_mem_table_add_v_col(
+	dict_table_t*	table,
+	mem_heap_t*	heap,
+	const char*	name,
+	ulint		mtype,
+	ulint		prtype,
+	ulint		len,
+	ulint		pos,
+	ulint		num_base);
+
+/** Adds a stored column definition to a table.
+@param[in]	table		table
+@param[in]	num_base	number of base columns. */
+void
+dict_mem_table_add_s_col(
+	dict_table_t*	table,
+	ulint		num_base);
+
+/**********************************************************************//**
+Renames a column of a table in the data dictionary cache. */
+void
+dict_mem_table_col_rename(
+/*======================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	ulint		nth_col,/*!< in: column index */
+	const char*	from,	/*!< in: old column name */
+	const char*	to,	/*!< in: new column name */
+	bool		is_virtual);
+				/*!< in: if this is a virtual column */
+/**********************************************************************//**
+This function populates a dict_col_t memory structure with
+supplied information. */
+void
+dict_mem_fill_column_struct(
+/*========================*/
+	dict_col_t*	column,		/*!< out: column struct to be
+					filled */
+	ulint		col_pos,	/*!< in: column position */
+	ulint		mtype,		/*!< in: main data type */
+	ulint		prtype,		/*!< in: precise type */
+	ulint		col_len);	/*!< in: column length */
+/**********************************************************************//**
+This function populates a dict_index_t index memory structure with
+supplied information. */
+UNIV_INLINE
+void
+dict_mem_fill_index_struct(
+/*=======================*/
+	dict_index_t*	index,		/*!< out: index to be filled */
+	mem_heap_t*	heap,		/*!< in: memory heap */
+	const char*	index_name,	/*!< in: index name */
+	ulint		type,		/*!< in: DICT_UNIQUE,
+					DICT_CLUSTERED, ... ORed */
+	ulint		n_fields);	/*!< in: number of fields */
+/**********************************************************************//**
+Creates an index memory object.
+@return own: index object */
+dict_index_t*
+dict_mem_index_create(
+/*==================*/
+	dict_table_t*	table,		/*!< in: table */
+	const char*	index_name,	/*!< in: index name */
+	ulint		type,		/*!< in: DICT_UNIQUE,
+					DICT_CLUSTERED, ... ORed */
+	ulint		n_fields);	/*!< in: number of fields */
+
+/**********************************************************************//**
+Frees an index memory object. */
+void
+dict_mem_index_free(
+/*================*/
+	dict_index_t*	index);	/*!< in: index */
+/**********************************************************************//**
+Creates and initializes a foreign constraint memory object.
+@return own: foreign constraint struct */
+dict_foreign_t*
+dict_mem_foreign_create(void);
+/*=========================*/
+
+/**********************************************************************//**
+Sets the foreign_table_name_lookup pointer based on the value of
+lower_case_table_names.  If that is 0 or 1, foreign_table_name_lookup
+will point to foreign_table_name.  If 2, then another string is
+allocated from the heap and set to lower case. */
+void
+dict_mem_foreign_table_name_lookup_set(
+/*===================================*/
+	dict_foreign_t*	foreign,	/*!< in/out: foreign struct */
+	ibool		do_alloc);	/*!< in: is an alloc needed */
+
+/**********************************************************************//**
+Sets the referenced_table_name_lookup pointer based on the value of
+lower_case_table_names.  If that is 0 or 1, referenced_table_name_lookup
+will point to referenced_table_name.  If 2, then another string is
+allocated from the heap and set to lower case. */
+void
+dict_mem_referenced_table_name_lookup_set(
+/*======================================*/
+	dict_foreign_t*	foreign,	/*!< in/out: foreign struct */
+	ibool		do_alloc);	/*!< in: is an alloc needed */
+
+/** Fills the dependent virtual columns in a set.
+Reason for being dependent are
+1) FK can be present on base column of virtual columns
+2) FK can be present on column which is a part of virtual index
+@param[in,out] foreign foreign key information. */
+void
+dict_mem_foreign_fill_vcol_set(
+       dict_foreign_t*	foreign);
+
+/** Fill virtual columns set in each fk constraint present in the table.
+@param[in,out] table   innodb table object. */
+void
+dict_mem_table_fill_foreign_vcol_set(
+        dict_table_t*	table);
+
+/** Free the vcol_set from all foreign key constraint on the table.
+@param[in,out] table   innodb table object. */
+void
+dict_mem_table_free_foreign_vcol_set(
+	dict_table_t*	table);
+
+/** Create a temporary tablename like "#sql-ibNNN".
+@param[in]	heap	A memory heap
+@param[in]	dbtab	Table name in the form database/table name
+@param[in]	id	Table id
+@return A unique temporary tablename suitable for InnoDB use */
+char*
+dict_mem_create_temporary_tablename(
+	mem_heap_t*	heap,
+	const char*	dbtab,
+	table_id_t	id);
+
+/** SQL identifier name wrapper for pretty-printing.
+Only the pointer is stored; the string itself is neither copied nor
+freed by this class. */
+class id_name_t
+{
+	/** The name in internal representation */
+	const char*	m_name;
+
+public:
+	/** Create a wrapper whose name is a null pointer. */
+	id_name_t()
+		: m_name(NULL)
+	{}
+
+	/** Create a wrapper for the given identifier.
+	@param[in]	name	identifier to assign */
+	explicit id_name_t(const char* name)
+		: m_name(name) {}
+
+	/** Replace the wrapped identifier.
+	@param[in]	name	identifier to assign
+	@return this object */
+	id_name_t& operator=(const char* name)
+	{
+		m_name = name;
+		return *this;
+	}
+
+	/** Implicit conversion to a C string.
+	@return the name */
+	operator const char*() const
+	{
+		return m_name;
+	}
+
+	/** Explicit access through the call operator.
+	@return the name */
+	const char* operator()() const
+	{
+		return m_name;
+	}
+};
+
+/** Data structure for a column in a table */
+struct dict_col_t{
+	/*----------------------*/
+	/** The following are copied from dtype_t,
+	so that all bit-fields can be packed tightly. */
+	/* @{ */
+	unsigned	prtype:32;	/*!< precise type; MySQL data
+					type, charset code, flags to
+					indicate nullability,
+					signedness, whether this is a
+					binary string, whether this is
+					a true VARCHAR where MySQL
+					uses 2 bytes to store the length */
+	unsigned	mtype:8;	/*!< main data type */
+
+	/* the remaining fields do not affect alphabetical ordering: */
+
+	unsigned	len:16;		/*!< length; for MySQL data this
+					is field->pack_length(),
+					except that for a >= 5.0.3
+					type true VARCHAR this is the
+					maximum byte length of the
+					string data (in addition to
+					the string, MySQL uses 1 or 2
+					bytes to store the string length) */
+
+	unsigned	mbminlen:3;	/*!< minimum length of a
+					character, in bytes */
+	unsigned	mbmaxlen:3;	/*!< maximum length of a
+					character, in bytes */
+	/*----------------------*/
+	/* End of definitions copied from dtype_t */
+	/* @} */
+
+	unsigned	ind:10;		/*!< table column position
+					(starting from 0) */
+	unsigned	ord_part:1;	/*!< nonzero if this column
+					appears in the ordering fields
+					of an index */
+	unsigned	max_prefix:12;	/*!< maximum index prefix length on
+					this column. Our current max limit is
+					3072 (REC_VERSION_56_MAX_INDEX_COL_LEN)
+					bytes. */
+private:
+	/** Special value of ind for a dropped column */
+	static const unsigned DROPPED = 1023;
+public:
+
+  /** Detach a virtual column from an index.
+  @param index  being-freed index */
+  inline void detach(const dict_index_t &index);
+
+  /** Data for instantly added columns */
+  struct def_t
+  {
+    /** original default value of instantly added column */
+    const void *data;
+    /** len of data, or UNIV_SQL_DEFAULT if unavailable */
+    ulint len;
+  } def_val;
+
+  /** Retrieve the column name.
+  @param table  the table of this column */
+  const char *name(const dict_table_t &table) const;
+
+  /** @return whether the DATA_VIRTUAL flag is set in prtype */
+  bool is_virtual() const { return (prtype & DATA_VIRTUAL) != 0; }
+  /** @return whether NULL is an allowed value (DATA_NOT_NULL is unset) */
+  bool is_nullable() const { return (prtype & DATA_NOT_NULL) == 0; }
+
+  /** @return whether the table of this system versioning column is
+  TRX_ID-based, that is, whether the column is stored as DATA_INT */
+  bool vers_native() const
+  {
+    const bool is_int= mtype == DATA_INT;
+    ut_ad(vers_sys_start() || vers_sys_end());
+    ut_ad(is_int || mtype == DATA_FIXBINARY);
+    return is_int;
+  }
+
+  /** @return whether this user column (not row_start, row_end)
+  has System Versioning property (all DATA_VERSIONED bits are set) */
+  bool is_versioned() const { return (~prtype & DATA_VERSIONED) == 0; }
+
+  /** @return whether this is the system version start */
+  bool vers_sys_start() const
+  { return DATA_VERS_START == (prtype & DATA_VERSIONED); }
+  /** @return whether this is the system version end */
+  bool vers_sys_end() const
+  { return DATA_VERS_END == (prtype & DATA_VERSIONED); }
+
+  /** @return whether a default value is recorded (instantly-added column) */
+  bool is_added() const
+  {
+    DBUG_ASSERT(!def_val.data || def_val.len != UNIV_SQL_DEFAULT);
+    return !(def_val.len == UNIV_SQL_DEFAULT);
+  }
+  /** Flag the column instantly dropped; only the position is changed. */
+  void set_dropped() { ind= DROPPED; }
+  /** Flag the column instantly dropped, reducing it to a binary column.
+  @param not_null  whether the column was NOT NULL
+  @param len2      whether the length exceeds 255 bytes
+  @param fixed     the fixed length in bytes, or 0 */
+  void set_dropped(bool not_null, bool len2, unsigned fixed)
+  {
+    DBUG_ASSERT(!(len2 && fixed));
+    ind= DROPPED;
+    ord_part= 0;
+    max_prefix= 0;
+    mbminlen= mbmaxlen= 0;
+    prtype= DATA_BINARY_TYPE | (not_null ? DATA_NOT_NULL : 0);
+    if (!fixed)
+    {
+      mtype= DATA_BINARY;
+      len= len2 ? 65535 : 255;
+    }
+    else
+    {
+      mtype= DATA_FIXBINARY;
+      len= static_cast<uint16_t>(fixed);
+    }
+  }
+  /** @return whether the column was instantly dropped */
+  bool is_dropped() const { return ind == DROPPED; }
+  /** @return whether the column was instantly dropped
+  @param index  the clustered index */
+  inline bool is_dropped(const dict_index_t &index) const;
+
+  /** Fetch the recorded default value of an instantly-added column.
+  @param[out] len   value length (in bytes), or UNIV_SQL_NULL
+  @return default value, or NULL if it is SQL NULL (len=UNIV_SQL_NULL) */
+  const byte *instant_value(ulint *len) const
+  {
+    DBUG_ASSERT(is_added());
+    const byte *value= static_cast<const byte*>(def_val.data);
+    *len= def_val.len;
+    return value;
+  }
+
+  /** Remove the 'instant ADD' status of the column */
+  void clear_instant()
+  {
+    def_val.data= NULL;
+    def_val.len= UNIV_SQL_DEFAULT;
+  }
+
+  /** @return whether two columns have compatible data type encoding */
+  bool same_type(const dict_col_t &other) const
+  {
+    if (mtype == other.mtype)
+    {
+      /* Identical main types: only strings need a closer look. */
+      if (!dtype_is_string_type(mtype))
+        return true;
+    }
+    else
+    {
+      /* For latin1_swedish_ci, DATA_CHAR and DATA_VARCHAR
+      will be used instead of DATA_MYSQL and DATA_VARMYSQL.
+      As long as mtype,prtype are being written to InnoDB
+      data dictionary tables, we cannot simplify this. */
+      bool interchangeable;
+      switch (mtype) {
+      case DATA_VARCHAR:
+        interchangeable= other.mtype == DATA_VARMYSQL;
+        break;
+      case DATA_VARMYSQL:
+        interchangeable= other.mtype == DATA_VARCHAR;
+        break;
+      case DATA_CHAR:
+        interchangeable= other.mtype == DATA_MYSQL;
+        break;
+      case DATA_MYSQL:
+        interchangeable= other.mtype == DATA_CHAR;
+        break;
+      default:
+        interchangeable= false;
+      }
+      if (!interchangeable)
+        return false;
+    }
+    /* Same string type, or an interchangeable pair: compare encodings. */
+    const uint16_t cset= dtype_get_charset_coll(prtype);
+    const uint16_t ocset= dtype_get_charset_coll(other.prtype);
+    return cset == ocset || dict_col_t::same_encoding(cset, ocset);
+  }
+
+  /** @return whether two collations codes have the same character encoding */
+  static bool same_encoding(uint16_t a, uint16_t b);
+
+  /** Determine if the columns have the same format
+  except for is_nullable() and is_versioned().
+  @param other   column to compare to (may be shorter than this one)
+  @return whether the columns have the same format */
+  bool same_format(const dict_col_t &other) const
+  {
+    const auto ignored= DATA_NOT_NULL | DATA_VERSIONED |
+      CHAR_COLL_MASK << 16 | DATA_LONG_TRUE_VARCHAR;
+    return same_type(other) && len >= other.len &&
+      mbminlen == other.mbminlen && mbmaxlen >= other.mbmaxlen &&
+      !((prtype ^ other.prtype) & ~ignored);
+  }
+
+  /** @return whether the column values are comparable by memcmp() */
+  bool is_binary() const { return prtype & DATA_BINARY_TYPE; }
+};
+
+/** Index information put in a list of virtual column structure. Index
+id and virtual column position in the index will be logged.
+There can be multiple entries for a given index, with a different position. */
+struct dict_v_idx_t {
+	/** active index on the column */
+	dict_index_t*	index;
+
+	/** position in this index */
+	ulint		nth_field;
+	/** Construct an entry from an index and a position in that index */
+	dict_v_idx_t(dict_index_t* index, ulint nth_field)
+		: index(index), nth_field(nth_field) {}
+};
+
+/** Data structure for a virtual column in a table */
+struct dict_v_col_t{
+	/** column structure; must be first (see dict_col_t::detach()) */
+	dict_col_t		m_col;
+
+	/** array of base column ptr */
+	dict_col_t**		base_col;
+
+	/** number of base column */
+	unsigned		num_base:10;
+
+	/** column pos in table */
+	unsigned		v_pos:10;
+
+	/** Virtual index list, and column position in the index */
+	std::forward_list<dict_v_idx_t, ut_allocator<dict_v_idx_t> >
+	v_indexes;
+
+  /** Detach the column from an index, by removing the first
+  v_indexes entry that refers to the index (if there is any).
+  @param index  index to be detached from */
+  void detach(const dict_index_t &index)
+  {
+    /* A forward_list can only erase the successor of a node,
+    so trail one step behind the element being examined. */
+    auto trail= v_indexes.before_begin();
+    auto cur= v_indexes.begin();
+    while (cur != v_indexes.end())
+    {
+      if (cur->index == &index)
+      {
+        v_indexes.erase_after(trail);
+        return;
+      }
+      trail= cur;
+      ++cur;
+    }
+  }
+};
+
+/** Data structure for newly added virtual columns in an index.
+It is used only during rollback_inplace_alter_table() of
+addition of index depending on newly added virtual columns
+and uses index heap. Should be freed when index is being
+removed from cache. */
+struct dict_add_v_col_info
+{
+  ulint n_v_col;
+  dict_v_col_t *v_col;
+  /** Duplicate a newly added virtual column while rolling back
+  the index which contains new virtual columns.
+  @param heap   memory heap from which v_col[] is allocated on first use
+  @param col    virtual column to be duplicated
+  @param offset offset where to duplicate virtual column */
+  dict_v_col_t* add_drop_v_col(mem_heap_t *heap, dict_v_col_t *col,
+                               ulint offset)
+  {
+    ut_ad(n_v_col);
+    ut_ad(offset < n_v_col);
+    if (v_col == nullptr)
+      v_col= static_cast<dict_v_col_t*>
+        (mem_heap_alloc(heap, n_v_col * sizeof(dict_v_col_t)));
+    dict_v_col_t *copy= new (&v_col[offset]) dict_v_col_t();
+    copy->m_col= col->m_col;
+    copy->v_pos= col->v_pos;
+    return copy;
+  }
+};
+
+/** Data structure for newly added virtual columns in a table */
+struct dict_add_v_col_t{
+	/** number of new virtual columns */
+	ulint			n_v_col;
+
+	/** column structures */
+	const dict_v_col_t*	v_col;
+
+	/** new column names */
+	const char**		v_col_name;
+};
+
+/** Data structure for a stored column in a table. */
+struct dict_s_col_t {
+	/** Stored column ptr */
+	dict_col_t*	m_col;
+	/** array of base column pointers */
+	dict_col_t**	base_col;
+	/** number of base columns */
+	ulint		num_base;
+	/** column position in the table */
+	ulint		s_pos;
+};
+
+/** list to put stored column for create_table_info_t */
+typedef std::forward_list<dict_s_col_t, ut_allocator<dict_s_col_t> >
+dict_s_col_list;
+
+/** @brief DICT_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and
+is the maximum indexed column length (or indexed prefix length) in
+ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. Also, in any format,
+any fixed-length field that is longer than this will be encoded as
+a variable-length field.
+
+It is set to 3*256, so that one can create a column prefix index on
+256 characters of a TEXT or VARCHAR column also in the UTF-8
+charset. In that charset, a character may take at most 3 bytes.  This
+constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
+files would be at risk! */
+#define DICT_ANTELOPE_MAX_INDEX_COL_LEN	REC_ANTELOPE_MAX_INDEX_COL_LEN
+
+/** Find out maximum indexed column length by its table format.
+For ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, the maximum
+field length is REC_ANTELOPE_MAX_INDEX_COL_LEN - 1 (767). For
+ROW_FORMAT=COMPRESSED and ROW_FORMAT=DYNAMIC, the length could
+be REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes */
+#define DICT_MAX_FIELD_LEN_BY_FORMAT(table)	\
+	(dict_table_has_atomic_blobs(table)	\
+	 ? REC_VERSION_56_MAX_INDEX_COL_LEN	\
+	 : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1)
+
+#define DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags)	\
+	(DICT_TF_HAS_ATOMIC_BLOBS(flags)		\
+	 ? REC_VERSION_56_MAX_INDEX_COL_LEN		\
+	 : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1)
+
+/** Defines the maximum fixed length column size */
+#define DICT_MAX_FIXED_COL_LEN		DICT_ANTELOPE_MAX_INDEX_COL_LEN
+
+#ifdef WITH_WSREP
+#define WSREP_MAX_SUPPORTED_KEY_LENGTH 3500
+#endif /* WITH_WSREP */
+
+/** Data structure for a field in an index */
+struct dict_field_t{
+	dict_col_t*	col;		/*!< pointer to the table column */
+	id_name_t	name;		/*!< name of the column */
+	unsigned	prefix_len:12;	/*!< 0 or the length of the column
+					prefix in bytes in a MySQL index of
+					type, e.g., INDEX (textcol(25));
+					must be smaller than
+					DICT_MAX_FIELD_LEN_BY_FORMAT;
+					NOTE that in the UTF-8 charset, MySQL
+					sets this to (mbmaxlen * the prefix len)
+					in UTF-8 chars */
+	unsigned	fixed_len:10;	/*!< 0 or the fixed length of the
+					column if smaller than
+					DICT_ANTELOPE_MAX_INDEX_COL_LEN */
+	/** 1=DESC, 0=ASC */
+	unsigned	descending:1;
+
+	/** Zero-fill the whole object, including col and name */
+	dict_field_t() { memset(static_cast<void*>(this), 0, sizeof *this); }
+
+	/** Check whether two index fields have equal prefix_len and fixed_len.
+	@param[in]	other	the other index field
+	@return	whether the index fields are equivalent */
+	bool same(const dict_field_t& other) const
+	{
+		if (prefix_len != other.prefix_len) return false;
+		return fixed_len == other.fixed_len;
+	}
+};
+
+/**********************************************************************//**
+PADDING HEURISTIC BASED ON LINEAR INCREASE OF PADDING TO AVOID
+COMPRESSION FAILURES
+(Note: this is relevant only for compressed indexes)
+GOAL: Avoid compression failures by maintaining information about the
+compressibility of data. If data is not very compressible then leave
+some extra space 'padding' in the uncompressed page making it more
+likely that compression of less than fully packed uncompressed page will
+succeed.
+
+This padding heuristic works by increasing the pad linearly until the
+desired failure rate is reached. A "round" is a fixed number of
+compression operations.
+After each round, the compression failure rate for that round is
+computed. If the failure rate is too high, then padding is incremented
+by a fixed value, otherwise it's left intact.
+If the compression failure is lower than the desired rate for a fixed
+number of consecutive rounds, then the padding is decreased by a fixed
+value. This is done to prevent overshooting the padding value,
+and to accommodate the possible change in data compressibility. */
+
+/** Number of zip ops in one round. */
+#define ZIP_PAD_ROUND_LEN			(128)
+
+/** Number of successful rounds after which the padding is decreased */
+#define ZIP_PAD_SUCCESSFUL_ROUND_LIMIT		(5)
+
+/** Amount by which padding is increased. */
+#define ZIP_PAD_INCR				(128)
+
+/** Percentage of compression failures that are allowed in a single
+round */
+extern ulong	zip_failure_threshold_pct;
+
+/** Maximum percentage of a page that can be allowed as a pad to avoid
+compression failures */
+extern ulong	zip_pad_max;
+
+/** Data structure to hold information about how much space in
+an uncompressed page should be left as padding to avoid compression
+failures. This estimate is based on a self-adapting heuristic. */
+struct zip_pad_info_t {
+  /** Dummy assignment operator for dict_index_t::clone(); copies nothing */
+  zip_pad_info_t &operator=(const zip_pad_info_t&) { return *this; }
+	std::mutex	mutex;	/*!< mutex protecting the info */
+	Atomic_relaxed<ulint>
+			pad;	/*!< number of bytes used as pad */
+	ulint		success;/*!< successful compression ops during
+				current round */
+	ulint		failure;/*!< failed compression ops during
+				current round */
+	ulint		n_rounds;/*!< number of currently successful
+				rounds */
+};
+
+/** Number of samples of data size kept when page compression fails for
+a certain index.*/
+#define STAT_DEFRAG_DATA_SIZE_N_SAMPLE	10
+
+/** "GEN_CLUST_INDEX" is the name reserved for InnoDB default
+system clustered index when there is no primary key. */
+const char innobase_index_reserve_name[] = "GEN_CLUST_INDEX";
+
+/** Data structure for an index.  Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_index_create(). */
+struct dict_index_t {
+  /** Columns whose character-set collation is being changed */
+  struct col_info
+  {
+    /** number of columns whose charset-collation is being changed */
+    unsigned n_cols;
+    /** columns with changed charset-collation */
+    dict_col_t *cols;
+    /** Copy a column with changed collation into cols[offset],
+    allocating the array from heap on first use.
+    @return the copy of col */
+    dict_col_t *add(mem_heap_t *heap, const dict_col_t &col, unsigned offset)
+    {
+      ut_ad(offset < n_cols);
+      if (cols == nullptr)
+        cols= static_cast<dict_col_t*>
+          (mem_heap_alloc(heap, n_cols * sizeof *cols));
+      return new (&cols[offset]) dict_col_t(col);
+    }
+  };
+
+  /** Maximum number of fields */
+  static constexpr unsigned MAX_N_FIELDS= (1U << 10) - 1;
+
+	index_id_t	id;	/*!< id of the index */
+	mem_heap_t*	heap;	/*!< memory heap */
+	id_name_t	name;	/*!< index name */
+	dict_table_t*	table;	/*!< back pointer to table */
+	/** root page number, or FIL_NULL if the index has been detached
+	from storage (DISCARD TABLESPACE or similar),
+	or 1 if the index is in table->freed_indexes */
+	unsigned	page:32;
+	unsigned	merge_threshold:6;
+				/*!< In the pessimistic delete, if the page
+				data size drops below this limit in percent,
+				merging it to a neighbor is tried */
+# define DICT_INDEX_MERGE_THRESHOLD_DEFAULT 50
+	unsigned	type:DICT_IT_BITS;
+				/*!< index type (DICT_CLUSTERED, DICT_UNIQUE,
+				DICT_IBUF, DICT_CORRUPT) */
+#define MAX_KEY_LENGTH_BITS 12
+	unsigned	trx_id_offset:MAX_KEY_LENGTH_BITS;
+				/*!< position of the trx id column
+				in a clustered index record, if the fields
+				before it are known to be of a fixed size,
+				0 otherwise */
+#if (1<<MAX_KEY_LENGTH_BITS) < HA_MAX_KEY_LENGTH
+# error (1<<MAX_KEY_LENGTH_BITS) < HA_MAX_KEY_LENGTH
+#endif
+	unsigned	n_user_defined_cols:10;
+				/*!< number of columns the user defined to
+				be in the index: in the internal
+				representation we add more columns */
+	unsigned	nulls_equal:1;
+				/*!< if true, SQL NULL == SQL NULL */
+	unsigned	n_uniq:10;/*!< number of fields from the beginning
+				which are enough to determine an index
+				entry uniquely */
+	unsigned	n_def:10;/*!< number of fields defined so far */
+	unsigned	n_fields:10;/*!< number of fields in the index */
+	unsigned	n_nullable:10;/*!< number of nullable fields */
+	unsigned	n_core_fields:10;/*!< number of fields in the index
+				(before the first time of instant add columns) */
+	/** number of bytes of null bits in ROW_FORMAT!=REDUNDANT node pointer
+	records; usually equal to UT_BITS_IN_BYTES(n_nullable), but
+	can be less in clustered indexes with instant ADD COLUMN */
+	unsigned	n_core_null_bytes:8;
+	/** magic value signalling that n_core_null_bytes was not
+	initialized yet */
+	static const unsigned NO_CORE_NULL_BYTES = 0xff;
+	/** The clustered index ID of the hard-coded SYS_INDEXES table. */
+	static const unsigned DICT_INDEXES_ID = 3;
+	unsigned	cached:1;/*!< TRUE if the index object is in the
+				dictionary cache */
+	unsigned	to_be_dropped:1;
+				/*!< TRUE if the index is to be dropped;
+				protected by dict_sys.latch */
+	unsigned	online_status:2;
+				/*!< enum online_index_status.
+				Transitions from ONLINE_INDEX_COMPLETE (to
+				ONLINE_INDEX_CREATION) are protected
+				by dict_sys.latch. Other changes are
+				protected by index->lock. */
+	unsigned	uncommitted:1;
+				/*!< a flag that is set for secondary indexes
+				that have not been committed to the
+				data dictionary yet. Protected by
+				MDL */
+
+#ifdef UNIV_DEBUG
+	/** whether this is a dummy index object */
+	bool		is_dummy;
+	/** whether btr_cur_instant_init() is in progress */
+	bool		in_instant_init;
+	uint32_t	magic_n;/*!< magic number */
+/** Value of dict_index_t::magic_n */
+# define DICT_INDEX_MAGIC_N	76789786
+#endif
+	dict_field_t*	fields;	/*!< array of field descriptions */
+	st_mysql_ftparser*
+			parser;	/*!< fulltext parser plugin */
+
+	/** It just indicates whether newly added virtual column
+	during alter. It stores column in case of alter failure.
+	It should use heap from dict_index_t. It should be freed
+	while removing the index from table. */
+	dict_add_v_col_info* new_vcol_info;
+
+	/** During ALTER TABLE, columns that a being-added index depends on
+	and whose encoding or collation is being changed to something
+	that is compatible with the clustered index.
+	Allocated from dict_index_t::heap.
+
+	@see rollback_inplace_alter_table()
+	@see ha_innobase_inplace_ctx::col_collations */
+	col_info* change_col_info;
+
+	UT_LIST_NODE_T(dict_index_t)
+			indexes;/*!< list of indexes of the table */
+#ifdef BTR_CUR_ADAPT
+	btr_search_t*	search_info;
+				/*!< info used in optimistic searches */
+#endif /* BTR_CUR_ADAPT */
+	row_log_t*	online_log;
+				/*!< the log of modifications
+				during online index creation;
+				valid when online_status is
+				ONLINE_INDEX_CREATION */
+	/*----------------------*/
+	/** Statistics for query optimization */
+	/* @{ */
+	ib_uint64_t*	stat_n_diff_key_vals;
+				/*!< approximate number of different
+				key values for this index, for each
+				n-column prefix where 1 <= n <=
+				dict_get_n_unique(index) (the array is
+				indexed from 0 to n_uniq-1); we
+				periodically calculate new
+				estimates */
+	ib_uint64_t*	stat_n_sample_sizes;
+				/*!< number of pages that were sampled
+				to calculate each of stat_n_diff_key_vals[],
+				e.g. stat_n_sample_sizes[3] pages were sampled
+				to get the number stat_n_diff_key_vals[3]. */
+	ib_uint64_t*	stat_n_non_null_key_vals;
+				/* approximate number of non-null key values
+				for this index, for each column where
+				1 <= n <= dict_get_n_unique(index) (the array
+				is indexed from 0 to n_uniq-1); This
+				is used when innodb_stats_method is
+				"nulls_ignored". */
+	ulint		stat_index_size;
+				/*!< approximate index size in
+				database pages */
+	ulint		stat_n_leaf_pages;
+				/*!< approximate number of leaf pages in the
+				index tree */
+	bool		stats_error_printed;
+				/*!< has persistent statistics error printed
+				for this index ? */
+	/* @} */
+	/** Statistics for defragmentation, these numbers are estimations and
+	could be very inaccurate at certain times, e.g. right after restart,
+	during defragmentation, etc. */
+	/* @{ */
+	ulint		stat_defrag_modified_counter;
+	ulint		stat_defrag_n_pages_freed;
+				/* number of pages freed by defragmentation. */
+	ulint		stat_defrag_n_page_split;
+				/* number of page splits since last full index
+				defragmentation. */
+	ulint		stat_defrag_data_size_sample[STAT_DEFRAG_DATA_SIZE_N_SAMPLE];
+				/* data size when compression failure happened
+				the most recent 10 times. */
+	ulint		stat_defrag_sample_next_slot;
+				/* in which slot the next sample should be
+				saved. */
+	/* @} */
+private:
+  /** R-tree split sequence number */
+  Atomic_relaxed<node_seq_t> rtr_ssn;
+public:
+  void set_ssn(node_seq_t ssn) { rtr_ssn= ssn; }
+  node_seq_t assign_ssn() { return rtr_ssn.fetch_add(1) + 1; }
+  node_seq_t ssn() const { return rtr_ssn; }
+
+	rtr_info_track_t*
+			rtr_track;/*!< tracking all R-Tree search cursors */
+	trx_id_t	trx_id; /*!< id of the transaction that created this
+				index, or 0 if the index existed
+				when InnoDB was started up */
+	zip_pad_info_t	zip_pad;/*!< Information about state of
+				compression failures and successes */
+  /** lock protecting the non-leaf index pages */
+  mutable index_lock lock;
+
+	/** Determine if the index has been committed to the
+	data dictionary.
+	@return whether the index definition has been committed */
+	bool is_committed() const
+	{
+		ut_ad(!(uncommitted && (type & DICT_CLUSTERED)));
+		return UNIV_LIKELY(!uncommitted);
+	}
+
+	/** Flag an index committed or uncommitted.
+	@param[in]	committed	whether the index is committed */
+	void set_committed(bool committed)
+	{
+		ut_ad(!to_be_dropped);
+		ut_ad(!(type & DICT_CLUSTERED) || committed);
+		ut_ad(!change_col_info || !committed);
+		uncommitted = committed ? 0 : 1;
+	}
+
+	/** Notify that the index pages are going to be modified.
+	@param[in,out]	mtr	mini-transaction */
+	inline void set_modified(mtr_t& mtr) const;
+
+	/** @return whether this index is readable
+	@retval	true	normally
+	@retval	false	if this is a single-table tablespace
+			and the .ibd file is missing, or a
+			page cannot be read or decrypted */
+	inline bool is_readable() const;
+
+	/** @return whether instant ALTER TABLE is in effect */
+	inline bool is_instant() const;
+
+	/** @return whether the index is the primary key index
+	(not the clustered index of the change buffer) */
+	bool is_primary() const
+	{
+		return DICT_CLUSTERED == (type & (DICT_CLUSTERED | DICT_IBUF));
+	}
+
+	/** @return whether this is a generated clustered index */
+	bool is_gen_clust() const { return type == DICT_CLUSTERED; }
+
+	/** @return whether this is a clustered index */
+	bool is_clust() const { return type & DICT_CLUSTERED; }
+
+	/** @return whether this is a unique index */
+	bool is_unique() const { return type & DICT_UNIQUE; }
+
+	/** @return whether this is a spatial index */
+	bool is_spatial() const { return UNIV_UNLIKELY(type & DICT_SPATIAL); }
+
+	/** @return whether this is the change buffer */
+	bool is_ibuf() const { return UNIV_UNLIKELY(type & DICT_IBUF); }
+
+	/** @return whether this index requires locking */
+	bool has_locking() const { return !is_ibuf(); }
+
+	/** @return whether this is a normal B-tree index
+	(not the change buffer, not SPATIAL, FULLTEXT or corrupted) */
+	bool is_btree() const {
+		return UNIV_LIKELY(!(type & (DICT_IBUF | DICT_SPATIAL
+					     | DICT_FTS | DICT_CORRUPT)));
+	}
+
+	/** @return whether the index includes virtual columns */
+	bool has_virtual() const { return type & DICT_VIRTUAL; }
+
+	/** @return the position of DB_TRX_ID */
+	uint16_t db_trx_id() const {
+		DBUG_ASSERT(is_primary());
+		DBUG_ASSERT(n_uniq);
+		DBUG_ASSERT(n_uniq <= MAX_REF_PARTS);
+		return n_uniq;
+	}
+	/** @return the position of DB_ROLL_PTR */
+	uint16_t db_roll_ptr() const
+	{
+		return static_cast<uint16_t>(db_trx_id() + 1);
+	}
+
+	/** @return the offset of the metadata BLOB field,
+	or the first user field after the PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR */
+	uint16_t first_user_field() const
+	{
+		return static_cast<uint16_t>(db_trx_id() + 2);
+	}
+
+	/** @return whether the index is corrupted */
+	inline bool is_corrupted() const;
+
+  /** Detach the virtual columns from the index that is to be removed.
+  Nothing is done unless the index is cached and has virtual columns. */
+  void detach_columns()
+  {
+    if (has_virtual() && cached)
+    {
+      /* Fields without a column, or with a non-virtual one, are skipped. */
+      for (unsigned pos= 0; pos < n_fields; ++pos)
+        if (dict_col_t *vcol= fields[pos].col)
+          if (vcol->is_virtual())
+            vcol->detach(*this);
+    }
+  }
+
+	/** Determine how many fields of a given prefix can be set NULL,
+	by discounting the nullable fields beyond the prefix from n_nullable.
+	@param[in]	n_prefix	number of fields in the prefix
+	@return	number of fields 0..n_prefix-1 that can be set NULL */
+	unsigned get_n_nullable(ulint n_prefix) const {
+		DBUG_ASSERT(n_prefix > 0);
+		DBUG_ASSERT(n_prefix <= n_fields);
+		unsigned count = n_nullable;
+		for (ulint i = n_prefix; i < n_fields; i++) {
+			const dict_col_t& col = *fields[i].col;
+			DBUG_ASSERT(!col.is_virtual());
+			count -= col.is_nullable() ? 1U : 0U;
+		}
+		DBUG_ASSERT(count < n_def);
+		return count;
+	}
+
+	/** Get the default value of an instantly-added clustered index field.
+	@param[in]	n	instantly added field position
+	@param[out]	len	value length (in bytes), or UNIV_SQL_NULL
+	@return	default value
+	@retval	NULL	if the default value is SQL NULL (len=UNIV_SQL_NULL) */
+	const byte* instant_field_value(ulint n, ulint* len) const
+	{
+		DBUG_ASSERT(is_instant() || id == DICT_INDEXES_ID);
+		DBUG_ASSERT(n + (id == DICT_INDEXES_ID) >= n_core_fields);
+		DBUG_ASSERT(n < n_fields);
+		return fields[n].col->instant_value(len);
+	}
+
+	/** Adjust index metadata for instant ADD/DROP/reorder COLUMN.
+	@param[in]	clustered index definition after instant ALTER TABLE */
+	inline void instant_add_field(const dict_index_t& instant);
+	/** Remove instant ADD COLUMN metadata. */
+	inline void clear_instant_add();
+	/** Remove instant ALTER TABLE metadata. */
+	inline void clear_instant_alter();
+
+	/** Construct the metadata record for instant ALTER TABLE.
+	@param[in]	row	dummy or default values for existing columns
+	@param[in,out]	heap	memory heap for allocations
+	@return	metadata record */
+	inline dtuple_t*
+	instant_metadata(const dtuple_t& row, mem_heap_t* heap) const;
+
+	/** Check if record in clustered index is historical row.
+	@param[in]	rec	clustered row
+	@param[in]	offsets	offsets
+	@return true if row is historical */
+	bool
+	vers_history_row(const rec_t* rec, const rec_offs* offsets);
+
+	/** Check if record in secondary index is historical row.
+	@param[in]	rec	record in a secondary index
+	@param[out]	history_row true if row is historical
+	@return true on error */
+	bool
+	vers_history_row(const rec_t* rec, bool &history_row);
+
+  /** Allocate new_vcol_info from the index heap and record the number
+  of new virtual columns that are being added as a part of the index.
+  @param        n_vcol  number of virtual columns to be added */
+  void assign_new_v_col(ulint n_vcol)
+  {
+    void *mem= mem_heap_zalloc(heap, sizeof(dict_add_v_col_info));
+    new_vcol_info= static_cast<dict_add_v_col_info*>(mem);
+    new_vcol_info->n_v_col= n_vcol;
+  }
+
+  /** @return whether index has new virtual column */
+  bool has_new_v_col() const { return new_vcol_info != nullptr; }
+
+  /** @return number of newly added virtual column */
+  ulint get_new_n_vcol() const
+  { return has_new_v_col() ? new_vcol_info->n_v_col : 0; }
+
+  /** Allocate change_col_info from the index heap and record its size.
+  @param  n_cols   number of columns whose collation is changing */
+  void init_change_cols(unsigned n_cols)
+  {
+    ut_ad((type & DICT_FTS) || n_cols < n_fields);
+    void *mem= mem_heap_zalloc(heap, sizeof *change_col_info);
+    change_col_info= static_cast<col_info*>(mem);
+    change_col_info->n_cols= n_cols;
+  }
+
+  /** Reconstruct the clustered index fields.
+  @return whether metadata is incorrect */
+  inline bool reconstruct_fields();
+
+  /** Check if the index contains a column or a prefix of that column.
+  @param[in]	n		column number
+  @param[in]	is_virtual	whether it is a virtual col
+  @return whether the index contains the column or its prefix */
+  bool contains_col_or_prefix(ulint n, bool is_virtual) const
+  MY_ATTRIBUTE((warn_unused_result));
+
+#ifdef BTR_CUR_HASH_ADAPT
+  /** @return a clone of this */
+  dict_index_t* clone() const;
+  /** Clone this index for lazy dropping of the adaptive hash index.
+  @return this or a clone */
+  dict_index_t* clone_if_needed();
+  /** @return number of leaf pages pointed to by the adaptive hash index */
+  inline ulint n_ahi_pages() const;
+  /** @return whether set_freed() had been invoked (page is 1) */
+  bool freed() const { return UNIV_UNLIKELY(page == 1); }
+  /** Note that the index is waiting for btr_search_lazy_free() */
+  void set_freed() { ut_ad(!freed()); page= 1; }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+  /** @return whether it is forbidden to invoke clear_instant_add() */
+  bool must_avoid_clear_instant_add() const
+  {
+    const dict_index_t *idx= is_instant() ? this : nullptr;
+    while (idx && (idx= UT_LIST_GET_NEXT(indexes, idx)) != nullptr)
+      if (idx->to_be_dropped /* || idx->online_log*/)
+        return true;
+    return false;
+  }
+
+	/** This ad-hoc class is used by record_size_info only.	*/
+	class record_size_info_t {
+	public:
+		record_size_info_t()
+		    : max_leaf_size(0), shortest_size(0), too_big(false),
+		      first_overrun_field_index(SIZE_T_MAX), overrun_size(0)
+		{}
+
+		/** Mark row potentially too big for page; remember the
+		lowest overflow field index and shortest_size at that time. */
+		void set_too_big(size_t field_index)
+		{
+			ut_ad(field_index != SIZE_T_MAX);
+			too_big = true;
+			if (field_index >= first_overrun_field_index) {
+				return;
+			}
+			first_overrun_field_index = field_index;
+			overrun_size = shortest_size;
+		}
+
+		/** @return overrun field index; debug builds assert
+		that the row has been marked too big */
+		size_t get_first_overrun_field_index() const
+		{
+			ut_ad(too_big);
+			ut_ad(first_overrun_field_index != SIZE_T_MAX);
+			return first_overrun_field_index;
+		}
+
+		size_t get_overrun_size() const
+		{
+			ut_ad(too_big);
+			return overrun_size;
+		}
+
+		bool row_is_too_big() const { return too_big; }
+
+		/** Biggest row size this index can produce */
+		size_t max_leaf_size;
+		/** shortest because it counts everything
+		as in overflow pages */
+		size_t shortest_size;
+
+	private:
+		/** true when the maximum row size this index can produce
+		is bigger than the maximum row size a page can hold */
+		bool too_big;
+		/** After adding this field index row overflowed maximum
+		allowed size. Useful for reporting back to user. */
+		size_t first_overrun_field_index;
+		/** Just overrun row size */
+		size_t overrun_size;
+	};
+
+	/** Returns the maximum possible record size for this index, the size
+	of the shortest possible record (with everything stored in overflow
+	pages), and the index of the first field which made index records
+	too big to fit on a page. */
+	inline record_size_info_t record_size_info() const;
+
+  /** Clear the index tree and reinitialize the root page, in the
+  rollback of TRX_UNDO_EMPTY. The BTR_SEG_LEAF is freed and reinitialized.
+  @param thr query thread
+  @return error code */
+  dberr_t clear(que_thr_t *thr);
+
+  /** Check whether online_log holds the dummy value (the address of this
+  index object itself), which indicates that the table undergoes active DDL.
+  @retval true if online_log is the dummy value */
+  bool online_log_is_dummy() const
+  {
+    const row_log_t *dummy= reinterpret_cast<const row_log_t*>(this);
+    return online_log == dummy;
+  }
+
+  /** Assign the dummy value (the address of this index object itself)
+  to the online log of the clustered index; online_log_is_dummy() will
+  hold afterwards. */
+  void online_log_make_dummy()
+  {
+    row_log_t *dummy= reinterpret_cast<row_log_t*>(this);
+    online_log= dummy;
+  }
+};
+
+/** Detach a virtual column from an index.
+Nothing is done for a column that is not virtual.
+@param index  being-freed index */
+inline void dict_col_t::detach(const dict_index_t &index)
+{
+  if (!is_virtual())
+    return;
+  dict_v_col_t *v_col= reinterpret_cast<dict_v_col_t*>(this);
+  v_col->detach(index);
+}
+
+/** Add a field definition to an index.
+The definition is written to index->fields[index->n_def], and
+index->n_def is incremented.
+@param index         index
+@param name          pointer to column name
+@param prefix_len    column prefix length, or 0; only the 12 least
+                     significant bits are retained
+@param descending    whether to use descending order */
+inline void dict_mem_index_add_field(dict_index_t *index, const char *name,
+                                     ulint prefix_len, bool descending= false)
+{
+  ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+  dict_field_t *const slot= &index->fields[index->n_def++];
+  slot->name= name;
+  slot->prefix_len= prefix_len & ((1U << 12) - 1);
+  slot->descending= descending;
+}
+
+/** The status of online index creation */
+enum online_index_status {
+	/** the index is complete and ready for access */
+	ONLINE_INDEX_COMPLETE = 0,
+	/** the index is being created, online
+	(allowing concurrent modifications) */
+	ONLINE_INDEX_CREATION,
+	/** secondary index creation was aborted and the index
+	should be dropped as soon as index->table->n_ref_count reaches 0,
+	or online table rebuild was aborted and the clustered index
+	of the original table should soon be restored to
+	ONLINE_INDEX_COMPLETE */
+	ONLINE_INDEX_ABORTED,
+	/** the online index creation was aborted, the index was
+	dropped from the data dictionary and the tablespace, and it
+	should be dropped from the data dictionary cache as soon as
+	index->table->n_ref_count reaches 0. */
+	ONLINE_INDEX_ABORTED_DROPPED
+};
+
+/** Set to store the virtual columns which are affected by Foreign
+key constraint. */
+typedef std::set<dict_v_col_t*, std::less<dict_v_col_t*>,
+		ut_allocator<dict_v_col_t*> >		dict_vcol_set;
+
+/** Data structure for a foreign key constraint; an example:
+FOREIGN KEY (A, B) REFERENCES TABLE2 (C, D).  Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_foreign_create(). */
+struct dict_foreign_t{
+	mem_heap_t*	heap;		/*!< this object is allocated from
+					this memory heap */
+	char*		id;		/*!< id of the constraint as a
+					null-terminated string */
+	unsigned	n_fields:10;	/*!< number of indexes' first fields
+					for which the foreign key
+					constraint is defined: we allow the
+					indexes to contain more fields than
+					mentioned in the constraint, as long
+					as the first fields are as mentioned */
+	unsigned	type:6;		/*!< 0 or a combination of the
+					DICT_FOREIGN_ON_* flags, such as
+					DICT_FOREIGN_ON_DELETE_CASCADE
+					or DICT_FOREIGN_ON_DELETE_SET_NULL */
+	char*		foreign_table_name;/*!< foreign table name */
+	char*		foreign_table_name_lookup;
+				/*!< foreign table name used for dict lookup */
+	dict_table_t*	foreign_table;	/*!< table where the foreign key is */
+	const char**	foreign_col_names;/*!< names of the columns in the
+					foreign key */
+	char*		referenced_table_name;/*!< referenced table name */
+	char*		referenced_table_name_lookup;
+				/*!< referenced table name for dict lookup*/
+	dict_table_t*	referenced_table;/*!< table where the referenced key
+					is */
+	const char**	referenced_col_names;/*!< names of the referenced
+					columns in the referenced table */
+	dict_index_t*	foreign_index;	/*!< foreign index; we require that
+					both tables contain explicitly defined
+					indexes for the constraint: InnoDB
+					does not generate new indexes
+					implicitly */
+	dict_index_t*	referenced_index;/*!< referenced index */
+
+	dict_vcol_set*	v_cols;		/*!< set of virtual columns affected
+					by foreign key constraint, or NULL
+					(see dict_foreign_free()) */
+
+	/** Check whether the fulltext index gets affected by
+	foreign key constraint
+	@return whether a fulltext index is affected */
+	bool affects_fulltext() const;
+};
+
+std::ostream&
+operator<< (std::ostream& out, const dict_foreign_t& foreign);
+
+/** A function object that writes a foreign key constraint to the
+output stream given at construction, using operator<<. */
+struct dict_foreign_print {
+
+	dict_foreign_print(std::ostream& out)
+		: m_out(out)
+	{}
+
+	/** Print one foreign key constraint.
+	@param[in]	foreign	constraint to print; must not be NULL */
+	void operator()(const dict_foreign_t* foreign) {
+		const dict_foreign_t&	fk = *foreign;
+		m_out << fk;
+	}
+private:
+	/** Stream that receives the output */
+	std::ostream&	m_out;
+};
+
+/** Compare two dict_foreign_t objects using their ids. Used in the ordering
+of dict_table_t::foreign_set and dict_table_t::referenced_set.  It returns
+true if the id of the first argument sorts before the id of the second
+according to strcmp(), and false otherwise. */
+struct dict_foreign_compare {
+
+	bool operator()(
+		const dict_foreign_t*	lhs,
+		const dict_foreign_t*	rhs) const
+	{
+		const int	order = strcmp(lhs->id, rhs->id);
+
+		return order < 0;
+	}
+};
+
+/** A predicate function object for finding a foreign key with the given
+index as the referenced index. It returns true if the referenced_index of
+the foreign key is the given index, and false otherwise. */
+struct dict_foreign_with_index {
+
+	dict_foreign_with_index(const dict_index_t*	index)
+	: m_index(index)
+	{}
+
+	bool operator()(const dict_foreign_t*	foreign) const
+	{
+		return m_index == foreign->referenced_index;
+	}
+
+	/** The index to look for */
+	const dict_index_t*	m_index;
+};
+
+/** A function object to check if the foreign constraint is between different
+tables.  Returns true if foreign_table and referenced_table of the foreign
+key constraint differ, false otherwise. */
+struct dict_foreign_different_tables {
+
+	bool operator()(const dict_foreign_t*	foreign) const
+	{
+		const dict_table_t*	child = foreign->foreign_table;
+		const dict_table_t*	parent = foreign->referenced_table;
+
+		return child != parent;
+	}
+};
+
+/** A function object to check if the foreign key constraint has the same
+name as given.  The comparison uses innobase_strcasecmp(). If the full id of
+the foreign key constraint doesn't match, the part of the id that follows
+its first '/' (the name without the database name) is compared as well.
+Return true if either comparison matches, false otherwise. */
+struct dict_foreign_matches_id {
+
+	dict_foreign_matches_id(const char* id)
+		: m_id(id)
+	{}
+
+	bool operator()(const dict_foreign_t*	foreign) const
+	{
+		if (innobase_strcasecmp(foreign->id, m_id) == 0) {
+			return true;
+		}
+
+		/* Retry without the prefix that ends at the first '/' */
+		const char*	slash = strchr(foreign->id, '/');
+
+		return slash != NULL
+			&& innobase_strcasecmp(m_id, slash + 1) == 0;
+	}
+
+	/** The name to compare against */
+	const char*	m_id;
+};
+
+typedef std::set<
+	dict_foreign_t*,
+	dict_foreign_compare,
+	ut_allocator<dict_foreign_t*> >	dict_foreign_set;
+
+std::ostream&
+operator<< (std::ostream& out, const dict_foreign_set& fk_set);
+
+/** Function object to check if a foreign key object is there
+in the given foreign key set or not.  It returns true if the
+foreign key is not found, false otherwise. The lookup uses the ordering
+of the set. */
+struct dict_foreign_not_exists {
+	dict_foreign_not_exists(const dict_foreign_set& obj_)
+		: m_foreigns(obj_)
+	{}
+
+	/** @return true if the given foreign key is not found */
+	bool operator()(dict_foreign_t* const & foreign) const {
+		return m_foreigns.count(foreign) == 0;
+	}
+private:
+	/** The set that is searched */
+	const dict_foreign_set&	m_foreigns;
+};
+
+/** Validate the search order in the foreign key set.
+@param[in]	fk_set	the foreign key set to be validated
+@return true if search order is fine in the set, false otherwise. */
+bool
+dict_foreign_set_validate(
+	const dict_foreign_set&	fk_set);
+
+/** Validate the search order in the foreign key sets of the table
+(foreign_set and referenced_set).
+@param[in]	table	table whose foreign key sets are to be validated
+@return true if foreign key sets are fine, false otherwise. */
+bool
+dict_foreign_set_validate(
+	const dict_table_t&	table);
+
+/*********************************************************************//**
+Frees a foreign key struct: deletes the v_cols set if there is one, and
+then frees the memory heap of the struct. */
+inline
+void
+dict_foreign_free(
+/*==============*/
+	dict_foreign_t*	foreign)	/*!< in, own: foreign key struct */
+{
+	if (foreign->v_cols) {
+		UT_DELETE(foreign->v_cols);
+	}
+
+	mem_heap_t*	heap = foreign->heap;
+
+	mem_heap_free(heap);
+}
+
+/** The destructor will free all the foreign key constraints in the set
+by calling dict_foreign_free() on each of the foreign key constraints.
+This is used to free the allocated memory when a local set goes out
+of scope. Only a reference to the set is kept; the set itself is not
+modified. */
+struct dict_foreign_set_free {
+
+	dict_foreign_set_free(const dict_foreign_set&	foreign_set)
+		: m_foreign_set(foreign_set)
+	{}
+
+	~dict_foreign_set_free()
+	{
+		for (dict_foreign_t* foreign : m_foreign_set) {
+			dict_foreign_free(foreign);
+		}
+	}
+
+	/** The set whose elements will be freed */
+	const dict_foreign_set&	m_foreign_set;
+};
+
+/** The flags for ON_UPDATE and ON_DELETE can be ORed; the default is that
+a foreign key constraint is enforced, therefore RESTRICT just means no flag */
+/* @{ */
+#define DICT_FOREIGN_ON_DELETE_CASCADE	1U	/*!< ON DELETE CASCADE */
+#define DICT_FOREIGN_ON_DELETE_SET_NULL	2U	/*!< ON DELETE SET NULL */
+#define DICT_FOREIGN_ON_UPDATE_CASCADE	4U	/*!< ON UPDATE CASCADE */
+#define DICT_FOREIGN_ON_UPDATE_SET_NULL	8U	/*!< ON UPDATE SET NULL */
+#define DICT_FOREIGN_ON_DELETE_NO_ACTION 16U	/*!< ON DELETE NO ACTION */
+#define DICT_FOREIGN_ON_UPDATE_NO_ACTION 32U	/*!< ON UPDATE NO ACTION */
+/* @} */
+
+/** Display an identifier.
+@param[in,out]	s	output stream
+@param[in]	id_name	SQL identifier (other than table name)
+@return the output stream */
+std::ostream&
+operator<<(
+	std::ostream&		s,
+	const id_name_t&	id_name);
+
+/** Display a table name.
+@param[in,out]	s		output stream
+@param[in]	table_name	table name
+@return the output stream */
+std::ostream&
+operator<<(
+	std::ostream&		s,
+	const table_name_t&	table_name);
+
+/** List of locks that different transactions have acquired on a table. This
+list has a list node that is embedded in a nested union/structure. We have to
+generate a specific template for it. */
+
+typedef ut_list_base<lock_t, ut_list_node<lock_t> lock_table_t::*>
+	table_lock_list_t;
+
+/** mysql template structure defined in row0mysql.cc */
+struct mysql_row_templ_t;
+
+/** Structure defines template related to virtual columns and
+their base columns. The constructor only initializes vtempl and
+mysql_table_query_id; the remaining scalar and pointer members are
+left for the user of the structure to set. */
+struct dict_vcol_templ_t {
+	/** number of regular columns */
+	ulint			n_col;
+
+	/** number of virtual columns */
+	ulint			n_v_col;
+
+	/** array of templates for virtual col and their base columns */
+	mysql_row_templ_t**	vtempl;
+
+	/** table's database name */
+	std::string		db_name;
+
+	/** table name */
+	std::string		tb_name;
+
+	/** MySQL record length */
+	ulint			rec_len;
+
+	/** default column value if any */
+	byte*			default_rec;
+
+	/** cached MySQL TABLE object */
+	TABLE*			mysql_table;
+
+	/** when mysql_table was cached */
+	uint64_t		mysql_table_query_id;
+
+	/** Start with no template array and with an all-ones query id */
+	dict_vcol_templ_t()
+		: vtempl(nullptr), mysql_table_query_id(~0ULL)
+	{}
+};
+
+/** Metadata on clustered index fields starting from first_user_field().
+The metadata is packed into a single 16-bit word. */
+class field_map_element_t
+{
+	/** Number of bits for representing a column number */
+	static constexpr uint16_t IND_BITS = 10;
+
+	/** Set if the column of the field has been instantly dropped */
+	static constexpr uint16_t DROPPED = 1U << (IND_BITS + 5);
+
+	/** Set if the column was dropped and originally declared NOT NULL */
+	static constexpr uint16_t NOT_NULL = 1U << (IND_BITS + 4);
+
+	/** Column index (if !(data & DROPPED)): table->cols[data & IND],
+	or field length (if (data & DROPPED)):
+	(data & IND) = 0 if variable-length with max_len < 256 bytes;
+	(data & IND) = 1 if variable-length with max_len > 255 bytes;
+	(data & IND) = 1 + L otherwise, with L=fixed length of the column */
+	static constexpr uint16_t IND = (1U << IND_BITS) - 1;
+
+	/** Field metadata */
+	uint16_t data;
+
+	/** Clear the NOT_NULL flag */
+	void clear_not_null() { data &= uint16_t(~NOT_NULL); }
+public:
+	/** @return whether the DROPPED flag is set */
+	bool is_dropped() const { return (data & DROPPED) != 0; }
+	/** Set the DROPPED flag */
+	void set_dropped() { data |= DROPPED; }
+	/** @return whether the NOT_NULL flag is set */
+	bool is_not_null() const { return (data & NOT_NULL) != 0; }
+	/** Set the NOT_NULL flag; the DROPPED flag must already be set */
+	void set_not_null()
+	{
+		ut_ad(is_dropped());
+		data |= NOT_NULL;
+	}
+	/** @return the IND part of the metadata */
+	uint16_t ind() const { return data & IND; }
+	/** Set the IND part of the metadata.
+	@param i	value to store; must fit in IND, and the
+			IND part must still be 0 */
+	void set_ind(uint16_t i)
+	{
+		DBUG_ASSERT(i <= IND);
+		DBUG_ASSERT(!ind());
+		data |= i;
+	}
+	/** Replace the whole metadata word.
+	@param value	new metadata
+	@return this element */
+	field_map_element_t& operator= (uint16_t value)
+	{
+		data = value;
+		return *this;
+	}
+	/** @return the whole metadata word */
+	operator uint16_t() { return data; }
+};
+
+static_assert(sizeof(field_map_element_t) == 2,
+	      "Size mismatch for a persistent data item!");
+
+/** Instantly dropped or reordered columns */
+struct dict_instant_t
+{
+	/** Number of dropped columns */
+	unsigned n_dropped;
+	/** Dropped columns */
+	dict_col_t* dropped;
+	/** Map of clustered index non-PK fields[i - first_user_field()]
+	to table columns */
+	field_map_element_t* field_map;
+};
+
+/** These are used when MySQL FRM and InnoDB data dictionary are
+in inconsistent state. */
+typedef enum {
+	DICT_FRM_CONSISTENT = 0,	/*!< Consistent state */
+	DICT_FRM_NO_PK = 1,		/*!< MySQL has no primary key
+					but InnoDB dictionary has
+					non-generated one. */
+	DICT_NO_PK_FRM_HAS = 2,		/*!< MySQL has primary key but
+					InnoDB dictionary has not. */
+	DICT_FRM_INCONSISTENT_KEYS = 3	/*!< Key count mismatch */
+} dict_frm_t;
+
+/** Data structure for a database table.  Most fields will be
+zero-initialized in dict_table_t::create(). */
+struct dict_table_t {
+
+	/** Get reference count.
+	@return current value of n_ref_count */
+	inline uint32_t get_ref_count() const
+	{
+		return n_ref_count;
+	}
+
+	/** Acquire the table handle. */
+	inline void acquire();
+
+	/** Release the table handle.
+	@return	whether the last handle was released */
+	inline bool release();
+
+	/** @return whether the table does not support rollback, that is,
+	whether all bits of DICT_TF_MASK_NO_ROLLBACK are set in flags */
+	bool no_rollback() const
+	{
+		return (~unsigned(flags) & DICT_TF_MASK_NO_ROLLBACK) == 0;
+	}
+	/** @return whether this is a temporary table
+	(DICT_TF2_TEMPORARY is set in flags2) */
+	bool is_temporary() const
+	{
+		return (flags2 & DICT_TF2_TEMPORARY) != 0;
+	}
+
+	/** @return whether the table is not in ROW_FORMAT=REDUNDANT
+	(DICT_TF_COMPACT is set in flags) */
+	bool not_redundant() const
+	{
+		return (flags & DICT_TF_COMPACT) != 0;
+	}
+
+	/** @return whether this table is readable
+	@retval	true	normally
+	@retval	false	if this is a single-table tablespace
+			and the .ibd file is missing, or a
+			page cannot be read or decrypted */
+	bool is_readable() const
+	{
+		/* A table without a tablespace must be flagged unreadable */
+		ut_ad(file_unreadable || space);
+
+		return UNIV_LIKELY(!file_unreadable);
+	}
+
+	/** @return whether the table is accessible: it is readable and not
+	corrupted, and its tablespace exists and is not being stopped */
+	bool is_accessible() const
+	{
+		if (!UNIV_LIKELY(is_readable() && !corrupted && space)) {
+			return false;
+		}
+
+		return !space->is_stopping();
+	}
+
+	/** Check if a table name contains the string "/#sql"
+	which denotes temporary or intermediate tables in MariaDB.
+	@param[in]	name	null-terminated table name
+	@return whether "/#sql" occurs anywhere in name */
+	static bool is_temporary_name(const char* name)
+	{
+		return strstr(name, "/#sql") != nullptr;
+	}
+
+	/** @return whether instant ALTER TABLE is in effect, as reported
+	by the first index in the indexes list */
+	bool is_instant() const
+	{
+		const dict_index_t*	first = UT_LIST_GET_FIRST(indexes);
+
+		return first->is_instant();
+	}
+
+	/** @return whether the table supports instant ALTER TABLE
+	(no DICT_TF_MASK_ZIP_SSIZE bit is set in flags) */
+	bool supports_instant() const
+	{
+		return (flags & DICT_TF_MASK_ZIP_SSIZE) == 0;
+	}
+
+	/** @return the number of instantly dropped columns
+	(0 if there is no instant structure) */
+	unsigned n_dropped() const
+	{
+		if (!instant) {
+			return 0;
+		}
+
+		return instant->n_dropped;
+	}
+
+	/** Look up an old column.
+	The map is searched from its last entry towards its first one.
+	@param[in]	cols	the old columns of the table
+	@param[in]	col_map	map from old table columns to altered ones
+	@param[in]	n_cols	number of old columns
+	@param[in]	i	the number of the new column
+	@return	old column
+	@retval	NULL	if column i was added to the table */
+	static const dict_col_t* find(const dict_col_t* cols,
+				      const ulint* col_map, ulint n_cols,
+				      ulint i)
+	{
+		ulint	o = n_cols;
+
+		while (o > 0) {
+			--o;
+			if (col_map[o] == i) {
+				return cols + o;
+			}
+		}
+
+		return NULL;
+	}
+
+	/** Serialise metadata of dropped or reordered columns.
+	@param[in,out]	heap	memory heap for allocation
+	@param[out]	field	data field with the metadata */
+	inline void serialise_columns(mem_heap_t* heap, dfield_t* field) const;
+
+	/** Reconstruct dropped or reordered columns.
+	@param[in]	metadata	data from serialise_columns()
+	@param[in]	len		length of the metadata, in bytes
+	@return whether parsing the metadata failed */
+	bool deserialise_columns(const byte* metadata, ulint len);
+
+	/** Set is_instant() before instant_column().
+	@param[in]	old		previous table definition
+	@param[in]	col_map		map from old.cols[]
+					and old.v_cols[] to this
+	@param[out]	first_alter_pos	0, or
+					1 + first changed column position */
+	inline void prepare_instant(const dict_table_t& old,
+				    const ulint* col_map,
+				    unsigned& first_alter_pos);
+
+	/** Adjust table metadata for instant ADD/DROP/reorder COLUMN.
+	@param[in]	table	table on which prepare_instant() was invoked
+	@param[in]	col_map	mapping from cols[] and v_cols[] to table
+	@return		whether the metadata record must be updated */
+	inline bool instant_column(const dict_table_t& table,
+				   const ulint* col_map);
+
+	/** Roll back instant_column().
+	@param[in]	old_n_cols		original n_cols
+	@param[in]	old_cols		original cols
+	@param[in]	old_col_names		original col_names
+	@param[in]	old_instant		original instant structure
+	@param[in]	old_fields		original fields
+	@param[in]	old_n_fields		original number of fields
+	@param[in]	old_n_core_fields	original number of core fields
+	@param[in]	old_n_v_cols		original n_v_cols
+	@param[in]	old_v_cols		original v_cols
+	@param[in]	old_v_col_names		original v_col_names
+	@param[in]	col_map			column map */
+	inline void rollback_instant(
+		unsigned	old_n_cols,
+		dict_col_t*	old_cols,
+		const char*	old_col_names,
+		dict_instant_t*	old_instant,
+		dict_field_t*	old_fields,
+		unsigned	old_n_fields,
+		unsigned	old_n_core_fields,
+		unsigned	old_n_v_cols,
+		dict_v_col_t*	old_v_cols,
+		const char*	old_v_col_names,
+		const ulint*	col_map);
+
+	/** Add the table definition to the data dictionary cache */
+	void add_to_cache();
+
+	/** @return whether the table is versioned.
+	It is assumed that both vers_start and vers_end set to 0
+	iff table is not versioned. In any other case,
+	these fields correspond to actual positions in cols[]. */
+	bool versioned() const
+	{
+		return vers_start != 0 || vers_end != 0;
+	}
+	/** @return whether the table is versioned() and the column
+	cols[vers_start] is of the main type DATA_INT */
+	bool versioned_by_id() const
+	{
+		if (!versioned()) {
+			return false;
+		}
+
+		return cols[vers_start].mtype == DATA_INT;
+	}
+
+	/** For overflow fields returns potential max length stored inline */
+	inline size_t get_overflow_field_local_len() const;
+
+  /** Parse the table file name into table name and database name.
+  @tparam        dict_frozen  whether the caller holds dict_sys.latch
+  @param[in,out] db_name      database name buffer
+  @param[in,out] tbl_name     table name buffer
+  @param[out] db_name_len     database name length
+  @param[out] tbl_name_len    table name length
+  @return whether the table name is visible to SQL */
+  template<bool dict_frozen= false>
+  bool parse_name(char (&db_name)[NAME_LEN + 1],
+                  char (&tbl_name)[NAME_LEN + 1],
+                  size_t *db_name_len, size_t *tbl_name_len) const;
+
+  /** Clear the table when rolling back TRX_UNDO_EMPTY
+  @return error code */
+  dberr_t clear(que_thr_t *thr);
+
+#ifdef UNIV_DEBUG
+  /** @return whether the current thread holds the lock_mutex */
+  bool lock_mutex_is_owner() const
+  { return lock_mutex_owner == pthread_self(); }
+  /** @return whether the current thread holds the stats_mutex (lock_mutex) */
+  bool stats_mutex_is_owner() const
+  { return lock_mutex_owner == pthread_self(); }
+#endif /* UNIV_DEBUG */
+  void lock_mutex_init() { lock_mutex.init(); }
+  void lock_mutex_destroy() { lock_mutex.destroy(); }
+  /** Acquire lock_mutex.
+  The calling thread must not already be the owner. The ut_ad() checks
+  refer to members that only exist with UNIV_DEBUG, so they are expected
+  to compile away otherwise. */
+  void lock_mutex_lock()
+  {
+    ut_ad(!lock_mutex_is_owner());
+    lock_mutex.wr_lock();
+    /* Record this thread as the owner; presumably exchange() returns the
+    previous value, which is asserted to be 0 (no owner). */
+    ut_ad(!lock_mutex_owner.exchange(pthread_self()));
+  }
+  /** Try to acquire lock_mutex.
+  The calling thread must not already be the owner.
+  @return whether lock_mutex.wr_lock_try() succeeded */
+  bool lock_mutex_trylock()
+  {
+    ut_ad(!lock_mutex_is_owner());
+    bool acquired= lock_mutex.wr_lock_try();
+    /* Only on success, record this thread as the owner; presumably
+    exchange() returns the previous value, which is asserted to be 0. */
+    ut_ad(!acquired || !lock_mutex_owner.exchange(pthread_self()));
+    return acquired;
+  }
+  /** Release lock_mutex.
+  The calling thread is expected to be the recorded owner. */
+  void lock_mutex_unlock()
+  {
+    /* Reset the recorded owner to 0 before releasing the latch;
+    presumably exchange() returns the previous owner, which is asserted
+    to be the calling thread. */
+    ut_ad(lock_mutex_owner.exchange(0) == pthread_self());
+    lock_mutex.wr_unlock();
+  }
+#ifndef SUX_LOCK_GENERIC
+  /** @return whether the lock mutex is held by some thread */
+  bool lock_mutex_is_locked() const noexcept { return lock_mutex.is_locked(); }
+#endif
+
+  /* stats mutex lock currently defaults to lock_mutex but in the future,
+  there could be a use-case to have separate mutex for stats.
+  extra indirection (through inline so no performance hit) should
+  help simplify code and increase long-term maintainability */
+  void stats_mutex_init() { lock_mutex_init(); }
+  void stats_mutex_destroy() { lock_mutex_destroy(); }
+  void stats_mutex_lock() { lock_mutex_lock(); }
+  void stats_mutex_unlock() { lock_mutex_unlock(); }
+
+  /** Rename the data file.
+  @param new_name     name of the table
+  @param replace      whether to replace the file with the new name
+                      (as part of rolling back TRUNCATE) */
+  dberr_t rename_tablespace(span<const char> new_name, bool replace) const;
+
+private:
+	/** Initialize instant->field_map.
+	@param[in]	table	table definition to copy from */
+	inline void init_instant(const dict_table_t& table);
+public:
+	/** Id of the table. */
+	table_id_t				id;
+	/** dict_sys.id_hash chain node */
+	dict_table_t*				id_hash;
+	/** Table name in name_hash */
+	table_name_t				name;
+	/** dict_sys.name_hash chain node */
+	dict_table_t*				name_hash;
+
+	/** Memory heap */
+	mem_heap_t*				heap;
+
+	/** NULL or the directory path specified by DATA DIRECTORY. */
+	char*					data_dir_path;
+
+	/** The tablespace of the table */
+	fil_space_t*				space;
+	/** Tablespace ID */
+	uint32_t				space_id;
+
+	/** Stores information about:
+	1 row format (redundant or compact),
+	2 compressed page size (zip shift size),
+	3 whether using atomic blobs,
+	4 whether the table has been created with the option DATA DIRECTORY.
+	Use DICT_TF_GET_COMPACT(), DICT_TF_GET_ZIP_SSIZE(),
+	DICT_TF_HAS_ATOMIC_BLOBS() and DICT_TF_HAS_DATA_DIR() to parse this
+	flag. */
+	unsigned				flags:DICT_TF_BITS;
+
+	/** Stores information about:
+	1 whether the table has been created using CREATE TEMPORARY TABLE,
+	2 whether the table has an internally defined DOC ID column,
+	3 whether the table has a FTS index,
+	4 whether DOC ID column need to be added to the FTS index,
+	5 whether the table is being created its own tablespace,
+	6 whether the table has been DISCARDed,
+	7 whether the aux FTS tables names are in hex.
+	Use DICT_TF2_FLAG_IS_SET() to parse this flag. */
+	unsigned				flags2:DICT_TF2_BITS;
+
+	/** TRUE if the table is an intermediate table during copy alter
+	operation or a partition/subpartition which is required for copying
+	data and skip the undo log for insertion of row in the table.
+	This variable will be set and unset during extra(), or during the
+	process of altering partitions */
+	unsigned                                skip_alter_undo:1;
+
+	/** Whether this is in a single-table tablespace and the .ibd
+	file is missing, or page decryption failed and the page is corrupted */
+	unsigned				file_unreadable:1;
+
+	/** TRUE if the table object has been added to the dictionary cache. */
+	unsigned				cached:1;
+
+	/** Number of non-virtual columns defined so far. */
+	unsigned				n_def:10;
+
+	/** Number of non-virtual columns. */
+	unsigned				n_cols:10;
+
+	/** Number of total columns (including virtual and non-virtual) */
+	unsigned				n_t_cols:10;
+
+	/** Number of total columns defined so far. */
+	unsigned                                n_t_def:10;
+
+	/** Number of virtual columns defined so far. */
+	unsigned                                n_v_def:10;
+
+	/** Number of virtual columns. */
+	unsigned                                n_v_cols:10;
+
+	/** 1 + the position of autoinc counter field in clustered
+	index, or 0 if there is no persistent AUTO_INCREMENT column in
+	the table. */
+	unsigned				persistent_autoinc:10;
+
+	/** TRUE if it's not an InnoDB system table or a table that has no FK
+	relationships. */
+	unsigned				can_be_evicted:1;
+
+	/** TRUE if table is corrupted. */
+	unsigned				corrupted:1;
+
+	/** TRUE if some indexes should be dropped after ONLINE_INDEX_ABORTED
+	or ONLINE_INDEX_ABORTED_DROPPED. */
+	unsigned				drop_aborted:1;
+
+	/** Array of column descriptions. */
+	dict_col_t*				cols;
+
+	/** Array of virtual column descriptions. */
+	dict_v_col_t*				v_cols;
+
+	/** List of stored column descriptions. It is used only for foreign key
+	check during create table and copy alter operations.
+	During copy alter, s_cols list is filled during create table operation
+	and need to preserve till rename table operation. That is the
+	reason s_cols is a part of dict_table_t */
+	dict_s_col_list*			s_cols;
+
+	/** Instantly dropped or reordered columns, or NULL if none */
+	dict_instant_t*				instant;
+
+	/** Column names packed in a character string
+	"name1\0name2\0...nameN\0". Until the string contains n_cols, it will
+	be allocated from a temporary heap. The final string will be allocated
+	from table->heap. */
+	const char*				col_names;
+
+	/** Virtual column names */
+	const char*				v_col_names;
+	unsigned	vers_start:10;
+				/*!< System Versioning: row start col index */
+	unsigned	vers_end:10;
+				/*!< System Versioning: row end col index */
+	bool		is_system_db;
+				/*!< True if the table belongs to a system
+				database (mysql, information_schema or
+				performance_schema) */
+	dict_frm_t	dict_frm_mismatch;
+				/*!< other than DICT_FRM_CONSISTENT (0)
+				if data dictionary information and
+				MySQL FRM information mismatch. */
+	/** The FTS_DOC_ID_INDEX, or NULL if no fulltext indexes exist */
+	dict_index_t*				fts_doc_id_index;
+
+	/** List of indexes of the table. */
+	UT_LIST_BASE_NODE_T(dict_index_t)	indexes;
+#ifdef BTR_CUR_HASH_ADAPT
+	/** List of detached indexes that are waiting to be freed along with
+	the last adaptive hash index entry.
+	Protected by autoinc_mutex (sic!) */
+	UT_LIST_BASE_NODE_T(dict_index_t)	freed_indexes;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+	/** List of foreign key constraints in the table. These refer to
+	columns in other tables. */
+	UT_LIST_BASE_NODE_T(dict_foreign_t)	foreign_list;
+
+	/** List of foreign key constraints which refer to this table. */
+	UT_LIST_BASE_NODE_T(dict_foreign_t)	referenced_list;
+
+	/** Node of the LRU list of tables. */
+	UT_LIST_NODE_T(dict_table_t)		table_LRU;
+
+	/** Maximum recursive level we support when loading tables chained
+	together with FK constraints. If exceeds this level, we will stop
+	loading child table into memory along with its parent table. */
+	byte					fk_max_recusive_level;
+
+  /** DDL transaction that last touched the table definition, or 0 if
+  no history is available. This includes possible changes in
+  ha_innobase::prepare_inplace_alter_table() and
+  ha_innobase::commit_inplace_alter_table(). */
+  trx_id_t def_trx_id;
+
+  /** Last transaction that inserted into an empty table.
+  Updated while holding exclusive table lock and an exclusive
+  latch on the clustered index root page (which must also be
+  an empty leaf page), and an ahi_latch (if btr_search_enabled). */
+  Atomic_relaxed<trx_id_t> bulk_trx_id;
+
+  /** Original table name, for MDL acquisition in purge. Normally,
+  this points to the same as name. When is_temporary_name(name.m_name) holds,
+  this should be a copy of the original table name, allocated from heap. */
+  table_name_t mdl_name;
+
+	/** Set of foreign key constraints in the table; these refer to
+	columns in other tables */
+	dict_foreign_set			foreign_set;
+
+	/** Set of foreign key constraints which refer to this table */
+	dict_foreign_set			referenced_set;
+
+	/** Statistics for query optimization. Mostly protected by
+	dict_sys.latch and stats_mutex_lock(). @{ */
+
+	/** TRUE if statistics have been calculated the first time after
+	database startup or table creation. */
+	unsigned				stat_initialized:1;
+
+	/** Timestamp of last recalc of the stats. */
+	time_t					stats_last_recalc;
+
+	/** The two bits below are set in the 'stat_persistent' member. They
+	have the following meaning:
+	1. _ON=0, _OFF=0, no explicit persistent stats setting for this table,
+	the value of the global srv_stats_persistent is used to determine
+	whether the table has persistent stats enabled or not
+	2. _ON=0, _OFF=1, persistent stats are explicitly disabled for this
+	table, regardless of the value of the global srv_stats_persistent
+	3. _ON=1, _OFF=0, persistent stats are explicitly enabled for this
+	table, regardless of the value of the global srv_stats_persistent
+	4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */
+	#define DICT_STATS_PERSISTENT_ON	(1 << 1)
+	#define DICT_STATS_PERSISTENT_OFF	(1 << 2)
+
+	/** Indicates whether the table uses persistent stats or not. See
+	DICT_STATS_PERSISTENT_ON and DICT_STATS_PERSISTENT_OFF. */
+	ib_uint32_t				stat_persistent;
+
+	/** The two bits below are set in the 'stats_auto_recalc' member. They
+	have the following meaning:
+	1. _ON=0, _OFF=0, no explicit auto recalc setting for this table, the
+	value of the global srv_stats_persistent_auto_recalc is used to
+	determine whether the table has auto recalc enabled or not
+	2. _ON=0, _OFF=1, auto recalc is explicitly disabled for this table,
+	regardless of the value of the global srv_stats_persistent_auto_recalc
+	3. _ON=1, _OFF=0, auto recalc is explicitly enabled for this table,
+	regardless of the value of the global srv_stats_persistent_auto_recalc
+	4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */
+	#define DICT_STATS_AUTO_RECALC_ON	(1 << 1)
+	#define DICT_STATS_AUTO_RECALC_OFF	(1 << 2)
+
+	/** Indicates whether the table uses automatic recalc for persistent
+	stats or not. See DICT_STATS_AUTO_RECALC_ON and
+	DICT_STATS_AUTO_RECALC_OFF. */
+	ib_uint32_t				stats_auto_recalc;
+
+	/** The number of pages to sample for this table during persistent
+	stats estimation. If this is 0, then the value of the global
+	srv_stats_persistent_sample_pages will be used instead. */
+	ulint					stats_sample_pages;
+
+	/** Approximate number of rows in the table. We periodically calculate
+	new estimates. */
+	ib_uint64_t				stat_n_rows;
+
+	/** Approximate clustered index size in database pages. */
+	ulint					stat_clustered_index_size;
+
+	/** Approximate size of other indexes in database pages. */
+	ulint					stat_sum_of_other_index_sizes;
+
+	/** How many rows are modified since last stats recalc. When a row is
+	inserted, updated, or deleted, we add 1 to this number; we calculate
+	new estimates for the table and the indexes if the table has changed
+	too much, see dict_stats_update_if_needed(). The counter is reset
+	to zero at statistics calculation. This counter is not protected by
+	any latch, because this is only used for heuristics. */
+	ib_uint64_t				stat_modified_counter;
+
+	bool		stats_error_printed;
+				/*!< Has persistent stats error been
+				already printed for this table ? */
+	/* @} */
+
+	/** AUTOINC related members. @{ */
+
+	/* The actual collection of tables locked during AUTOINC read/write is
+	kept in trx_t. In order to quickly determine whether a transaction has
+	locked the AUTOINC lock we keep a pointer to the transaction here in
+	the 'autoinc_trx' member. This is to avoid acquiring the
+	lock_sys.latch and scanning the vector in trx_t.
+	When an AUTOINC lock has to wait, the corresponding lock instance is
+	created on the trx lock heap rather than use the pre-allocated instance
+	in autoinc_lock below. */
+
+	/** A buffer for an AUTOINC lock for this table. We allocate the
+	memory here so that individual transactions can get it and release it
+	without a need to allocate space from the lock heap of the trx:
+	otherwise the lock heap would grow rapidly if we do a large insert
+	from a select. */
+	lock_t*					autoinc_lock;
+
+  /** Mutex protecting autoinc and freed_indexes. */
+  srw_spin_mutex autoinc_mutex;
+private:
+  /** Mutex protecting locks on this table. */
+  srw_spin_mutex lock_mutex;
+#ifdef UNIV_DEBUG
+  /** The owner of lock_mutex (0 if none) */
+  Atomic_relaxed<pthread_t> lock_mutex_owner{0};
+#endif
+public:
+  /** Autoinc counter value to give to the next inserted row. */
+  uint64_t autoinc;
+
+  /** The transaction that currently holds the AUTOINC lock on this table.
+  Protected by lock_mutex.
+  The thread that is executing autoinc_trx may read this field without
+  holding a latch, in row_lock_table_autoinc_for_mysql().
+  Only the autoinc_trx thread may clear this field; it cannot be
+  modified on the behalf of a transaction that is being handled by a
+  different thread. */
+  Atomic_relaxed<const trx_t*> autoinc_trx;
+
+  /** Number of granted or pending autoinc_lock on this table. This
+  value is set after acquiring lock_sys.latch but
+  in innodb_autoinc_lock_mode=1 (the default),
+  ha_innobase::innobase_lock_autoinc() will perform a dirty read
+  to determine whether other transactions have acquired the autoinc_lock. */
+  uint32_t n_waiting_or_granted_auto_inc_locks;
+
+	/* @} */
+
+  /** Number of granted or pending LOCK_S or LOCK_X on the table.
+  Protected by lock_sys.assert_locked(*this). */
+  uint32_t n_lock_x_or_s;
+
+	/** FTS specific state variables. */
+	fts_t*					fts;
+
+	/** Quiescing states, protected by the dict_index_t::lock. ie. we can
+	only change the state if we acquire all the latches (dict_index_t::lock)
+	in X mode of this table's indexes. */
+	ib_quiesce_t				quiesce;
+
+  /** Count of the number of record locks on this table. We use this to
+  determine whether we can evict the table from the dictionary cache.
+  Modified when lock_sys.is_writer(), or
+  lock_sys.assert_locked(page_id) and trx->mutex_is_owner() hold.
+  @see trx_lock_t::trx_locks */
+  Atomic_counter<uint32_t> n_rec_locks;
+private:
+  /** Count of how many handles are opened to this table. Dropping of the
+  table is NOT allowed until this count gets to zero. MySQL does NOT
+  itself check the number of open handles at DROP. */
+  Atomic_counter<uint32_t> n_ref_count;
+public:
+  /** List of locks on the table. Protected by lock_sys.assert_locked(lock). */
+  table_lock_list_t locks;
+
+  /** Timestamp of the last modification of this table. */
+  Atomic_relaxed<time_t> update_time;
+  /** Transactions whose view low limit is greater than this number are
+  not allowed to access the MariaDB query cache.
+  @see innobase_query_caching_table_check_low()
+  @see trx_t::commit_tables() */
+  Atomic_relaxed<trx_id_t> query_cache_inv_trx_id;
+
+#ifdef UNIV_DEBUG
+	/** Value of 'magic_n'. */
+	#define DICT_TABLE_MAGIC_N		76333786
+
+	/** Magic number. */
+	ulint					magic_n;
+#endif /* UNIV_DEBUG */
+	/** mysql_row_templ_t for base columns used for computing the virtual
+	columns */
+	dict_vcol_templ_t*			vc_templ;
+
+  /** @return whether the lock list of this table contains a lock
+  that belongs to a transaction other than the given one
+  @param trx  the transaction whose own locks are disregarded */
+  bool has_lock_other_than(const trx_t *trx) const
+  {
+    lock_t *l= UT_LIST_GET_FIRST(locks);
+    while (l && l->trx == trx)
+      l= UT_LIST_GET_NEXT(un_member.tab_lock.locks, l);
+    return l != nullptr;
+  }
+
+  /** @return whether the first index has an online_log (DDL in progress) */
+  bool is_active_ddl() const
+  {
+    return UT_LIST_GET_FIRST(indexes)->online_log;
+  }
+
+  /** @return whether the name is
+  mysql.innodb_index_stats or mysql.innodb_table_stats */
+  bool is_stats_table() const;
+
+  /** @return number of unique columns in FTS_DOC_ID index; 2 iff versioned() */
+  unsigned fts_n_uniq() const { return versioned() ? 2 : 1; }
+
+  /** Create metadata.
+  @param name     table name
+  @param space    tablespace
+  @param n_cols   total number of columns (both virtual and non-virtual)
+  @param n_v_cols number of virtual columns
+  @param flags    table flags
+  @param flags2   table flags2
+  @return newly allocated table object */
+  static dict_table_t *create(const span<const char> &name, fil_space_t *space,
+                              ulint n_cols, ulint n_v_cols, ulint flags,
+                              ulint flags2);
+
+  /** @return whether any index after the first one in the list is spatial */
+  bool has_spatial_index() const
+  {
+    auto idx= UT_LIST_GET_FIRST(indexes);
+    while ((idx= UT_LIST_GET_NEXT(indexes, idx)) != nullptr)
+      if (idx->is_spatial())
+        return true;
+    return false;
+  }
+};
+
+inline void dict_index_t::set_modified(mtr_t& mtr) const
+{
+	mtr.set_named_space(table->space);	/* the space of this index's table */
+}
+
+inline bool table_name_t::is_temporary() const
+{
+	return dict_table_t::is_temporary_name(m_name);	/* judged by name only */
+}
+
+inline bool dict_index_t::is_readable() const { return table->is_readable(); } /* same as the table */
+
+inline bool dict_index_t::is_instant() const
+{
+	const bool	all_core = n_core_fields == n_fields;
+	ut_ad(n_core_fields > 0);
+	ut_ad(n_core_fields <= n_fields || table->n_dropped());
+	ut_ad(all_core
+	      || (type & ~(DICT_UNIQUE | DICT_CORRUPT)) == DICT_CLUSTERED);
+	ut_ad(all_core || table->supports_instant());
+	ut_ad(all_core || !table->is_temporary());
+	ut_ad(!table->instant || !table->is_temporary());
+	/* Field counts differ, or a primary index whose table has 'instant' */
+	return !all_core || (is_primary() && table->instant);
+}
+
+inline bool dict_index_t::is_corrupted() const
+{
+	const bool	bad = online_status >= ONLINE_INDEX_ABORTED
+		|| (type & DICT_CORRUPT) || (table && table->corrupted);
+	return UNIV_UNLIKELY(bad);
+}
+
+inline void dict_index_t::clear_instant_add()
+{
+  DBUG_ASSERT(is_primary());
+  DBUG_ASSERT(is_instant());
+  DBUG_ASSERT(!table->instant);
+  for (dict_field_t *f= &fields[n_core_fields]; f < &fields[n_fields]; f++)
+    f->col->clear_instant();
+  n_core_fields= n_fields;
+  const unsigned nullable= n_nullable;
+  n_core_null_bytes= static_cast<byte>(UT_BITS_IN_BYTES(nullable));
+}
+
+inline void dict_index_t::clear_instant_alter()
+{
+	DBUG_ASSERT(is_primary());
+	DBUG_ASSERT(n_fields == n_def);
+	/* Without table->instant, at most clear_instant_add() is needed. */
+	if (!table->instant) {
+		if (is_instant()) {
+			clear_instant_add();
+		}
+		return;
+	}
+
+#ifndef DBUG_OFF
+	for (unsigned i = first_user_field(); i--; ) {
+		DBUG_ASSERT(!fields[i].col->is_dropped());
+		DBUG_ASSERT(!fields[i].col->is_nullable());
+	}
+#endif
+	const dict_col_t* ai_col = table->persistent_autoinc
+		? fields[table->persistent_autoinc - 1].col
+		: NULL;	/* a nonzero persistent_autoinc is 1 + field position */
+	dict_field_t* const begin = &fields[first_user_field()];
+	dict_field_t* end = &fields[n_fields];
+
+	for (dict_field_t* d = begin; d < end; ) {
+		/* Move fields for dropped columns to the end. */
+		if (!d->col->is_dropped()) {
+			d++;
+		} else {
+			if (d->col->is_nullable()) {
+				n_nullable--;
+			}
+
+			std::swap(*d, *--end);
+		}
+	}
+	/* [begin, end) now holds only fields of non-dropped columns. */
+	DBUG_ASSERT(&fields[n_fields - table->n_dropped()] == end);
+	n_core_fields = n_fields = n_def
+		= static_cast<unsigned>(end - fields) & MAX_N_FIELDS;
+	n_core_null_bytes = static_cast<byte>(UT_BITS_IN_BYTES(n_nullable));
+	std::sort(begin, end, [](const dict_field_t& a, const dict_field_t& b)
+			      { return a.col->ind < b.col->ind; });
+	table->instant = NULL;
+	if (ai_col) {	/* find the column again after the reordering */
+		auto a = std::find_if(fields, end,
+				      [ai_col](const dict_field_t& f)
+				      { return f.col == ai_col; });
+		table->persistent_autoinc = (a == end)
+			? 0
+			: (1 + static_cast<unsigned>(a - fields))
+			& MAX_N_FIELDS;
+	}
+}
+
+/** @return whether the column was instantly dropped (same as is_dropped())
+@param[in] index	the clustered index; only used by debug assertions */
+inline bool dict_col_t::is_dropped(const dict_index_t& index) const
+{
+	DBUG_ASSERT(index.is_primary());
+	DBUG_ASSERT(!is_dropped() == !index.table->instant);
+	DBUG_ASSERT(!is_dropped() || (this >= index.table->instant->dropped
+				      && this < index.table->instant->dropped
+				      + index.table->instant->n_dropped));
+	return is_dropped();
+}
+
+/*******************************************************************//**
+Initialise the table lock list. */
+void
+lock_table_lock_list_init(
+/*======================*/
+	table_lock_list_t*	locks);		/*!< List to initialise */
+
+/** Functor that inserts a foreign key constraint into the referenced_set of
+its referenced table, if that table exists in the dictionary cache. */
+struct dict_foreign_add_to_referenced_table {
+	void operator()(dict_foreign_t*	foreign) const
+	{
+		dict_table_t* const	ref = foreign->referenced_table;
+		if (ref != NULL) {
+			const auto ins = ref->referenced_set.insert(foreign);
+			ut_a(ins.second);
+		}
+	}
+};
+
+/** Tell whether a column is used in spatial indexes, regular indexes or both.
+@param[in]	col	column to check
+@return spatial status */
+inline
+spatial_status_t
+dict_col_get_spatial_status(
+	const dict_col_t*	col)
+{
+	if (!col->ord_part) {
+		/* Column is not a part of any index. */
+		return(SPATIAL_NONE);
+	}
+
+	if (DATA_GEOMETRY_MTYPE(col->mtype)) {
+		/* Any regular index on a geometry column should
+		have a prefix. A geometry column without one is
+		therefore reported as indexed spatially only. */
+		if (col->max_prefix != 0) {
+			return(SPATIAL_MIXED);
+		}
+
+		return(SPATIAL_ONLY);
+	}
+
+	/* An indexed column that is not of a geometry type. */
+	return(SPATIAL_NONE);
+}
+
+/** Clear defragmentation summary: zero stat_defrag_n_pages_freed of index. */
+inline void dict_stats_empty_defrag_summary(dict_index_t* index)
+{
+	index->stat_defrag_n_pages_freed = 0;
+}
+
+/** Clear defragmentation related index stats (modified and split counters). */
+inline void dict_stats_empty_defrag_stats(dict_index_t* index)
+{
+	index->stat_defrag_modified_counter = 0;
+	index->stat_defrag_n_page_split = 0;
+}
+
+#include "dict0mem.inl"
+
+#endif /* dict0mem_h */
diff --git a/storage/innobase/include/dict0mem.inl b/storage/innobase/include/dict0mem.inl
new file mode 100644
index 00000000..d60ee5d9
--- /dev/null
+++ b/storage/innobase/include/dict0mem.inl
@@ -0,0 +1,68 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0mem.inl
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "data0type.h"
+#include "dict0mem.h"
+#include "fil0fil.h"
+
+/**********************************************************************//**
+Populate a dict_index_t memory structure with the supplied information.
+The field array is allocated only if a memory heap is supplied. */
+UNIV_INLINE
+void
+dict_mem_fill_index_struct(
+/*=======================*/
+	dict_index_t*	index,		/*!< out: index to be filled */
+	mem_heap_t*	heap,		/*!< in: memory heap */
+	const char*	index_name,	/*!< in: index name */
+	ulint		type,		/*!< in: DICT_UNIQUE,
+					DICT_CLUSTERED, ... ORed */
+	ulint		n_fields)	/*!< in: number of fields */
+{
+	index->heap = heap;
+
+	if (heap == NULL) {
+		index->name = index_name;
+		index->fields = NULL;
+	} else {
+		/* The '1 +' below prevents allocation
+		of an empty mem block */
+		index->name = mem_heap_strdup(heap, index_name);
+		index->fields = (dict_field_t*) mem_heap_alloc(
+			heap, 1 + n_fields * sizeof(dict_field_t));
+	}
+
+	index->type = type & ((1U << DICT_IT_BITS) - 1);
+	index->page = FIL_NULL;
+	index->merge_threshold = DICT_INDEX_MERGE_THRESHOLD_DEFAULT;
+	/* Both field counts start out equal. */
+	index->n_fields = static_cast<unsigned>(n_fields)
+		& index->MAX_N_FIELDS;
+	index->n_core_fields = static_cast<unsigned>(n_fields)
+		& index->MAX_N_FIELDS;
+	index->nulls_equal = false;
+	ut_d(index->magic_n = DICT_INDEX_MAGIC_N);
+}
diff --git a/storage/innobase/include/dict0pagecompress.h b/storage/innobase/include/dict0pagecompress.h
new file mode 100644
index 00000000..f1272dc4
--- /dev/null
+++ b/storage/innobase/include/dict0pagecompress.h
@@ -0,0 +1,61 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2017, MariaDB Corporation. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0pagecompress.h
+Helper functions for extracting/storing page compression information
+to dictionary.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+#ifndef dict0pagecompress_h
+#define dict0pagecompress_h
+
+/********************************************************************//**
+Extract the page compression level from table flags.
+@return	page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_tf_get_page_compression_level(
+/*===============================*/
+	ulint	flags)			/*!< in: flags */
+	__attribute__((const));
+/********************************************************************//**
+Extract the page compression flag from table flags
+@return	page compression flag, or false if not compressed */
+UNIV_INLINE
+ibool
+dict_tf_get_page_compression(
+/*==========================*/
+	ulint	flags)			/*!< in: flags */
+	__attribute__((const));
+
+/********************************************************************//**
+Get the page compression level of a table.
+@return	page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_page_compression_level(
+/*==============================*/
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((const));
+
+#include "dict0pagecompress.inl"
+
+#endif
diff --git a/storage/innobase/include/dict0pagecompress.inl b/storage/innobase/include/dict0pagecompress.inl
new file mode 100644
index 00000000..c959f9ca
--- /dev/null
+++ b/storage/innobase/include/dict0pagecompress.inl
@@ -0,0 +1,81 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2017, MariaDB Corporation. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0pagecompress.inl
+Inline implementation for helper functions for extracting/storing
+page compression and atomic writes information to dictionary.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+/********************************************************************//**
+Read the page compression level out of dict_table_t::flags.
+The flags are an in-memory copy, hence the level is asserted to be valid.
+@return	page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_tf_get_page_compression_level(
+/*===============================*/
+	ulint	flags)	/*!< in: flags */
+{
+	const ulint	level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags);
+
+	ut_ad(level <= 9);
+
+	return(level);
+}
+
+/********************************************************************//**
+Get the page compression level from the flags of a table.
+@return	page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_page_compression_level(
+/*==============================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	ut_ad(table);
+	const ulint	flags = table->flags;
+	ut_ad(dict_tf_get_page_compression(flags));
+	return(dict_tf_get_page_compression_level(flags));
+}
+
+/********************************************************************//**
+Extract the page compression flag from table flags.
+@return	true if page compressed, false if not */
+UNIV_INLINE
+ibool
+dict_tf_get_page_compression(
+/*=========================*/
+	ulint	flags)	/*!< in: flags */
+{
+	return(DICT_TF_GET_PAGE_COMPRESSION(flags));
+}
+
+/********************************************************************//**
+Check whether the table uses the page compressed page format.
+@return	the result of dict_tf_get_page_compression() on table->flags */
+UNIV_INLINE
+ibool
+dict_table_is_page_compressed(
+/*==========================*/
+	const dict_table_t* table)	/*!< in: table */
+{
+	return (dict_tf_get_page_compression(table->flags));
+}
diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h
new file mode 100644
index 00000000..0dc1b984
--- /dev/null
+++ b/storage/innobase/include/dict0stats.h
@@ -0,0 +1,238 @@
+/*****************************************************************************
+
+Copyright (c) 2009, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats.h
+Code used for calculating and manipulating table statistics.
+
+Created Jan 06, 2010 Vasil Dimov
+*******************************************************/
+
+#ifndef dict0stats_h
+#define dict0stats_h
+
+#include "dict0types.h"
+#include "trx0types.h"
+
+enum dict_stats_upd_option_t {
+	DICT_STATS_RECALC_PERSISTENT,/* (re) calculate the
+				statistics using a precise and slow
+				algorithm and save them to the persistent
+				storage; if the persistent storage is
+				not present then emit a warning and
+				fall back to transient stats */
+	DICT_STATS_RECALC_TRANSIENT,/* (re) calculate the statistics
+				using an imprecise quick algorithm
+				without saving the results
+				persistently */
+	DICT_STATS_EMPTY_TABLE,	/* Write all zeros (or 1 where it makes sense)
+				into a table and its indexes' statistics
+				members. The resulting stats correspond to an
+				empty table. If the table is using persistent
+				statistics, then they are saved on disk. */
+	DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY /* fetch the stats
+				from the persistent storage if the in-memory
+				structures have not been initialized yet,
+				otherwise do nothing */
+};
+
+/*********************************************************************//**
+Set the persistent statistics flag for a given table. This is set only
+in the in-memory table object and is not saved on disk. It will be read
+from the .frm file upon first open from MySQL after a server restart. */
+UNIV_INLINE
+void
+dict_stats_set_persistent(
+/*======================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	ibool		ps_on,	/*!< in: persistent stats explicitly enabled */
+	ibool		ps_off)	/*!< in: persistent stats explicitly disabled */
+	MY_ATTRIBUTE((nonnull));
+
+/** @return whether persistent statistics is enabled for a given table */
+UNIV_INLINE
+bool
+dict_stats_is_persistent_enabled(const dict_table_t* table)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Set the auto recalc flag for a given table (only honored for a persistent
+stats enabled table). The flag is set only in the in-memory table object
+and is not saved in InnoDB files. It will be read from the .frm file upon
+first open from MySQL after a server restart. */
+UNIV_INLINE
+void
+dict_stats_auto_recalc_set(
+/*=======================*/
+	dict_table_t*	table,			/*!< in/out: table */
+	ibool		auto_recalc_on,		/*!< in: explicitly enabled */
+	ibool		auto_recalc_off);	/*!< in: explicitly disabled */
+
+/** @return whether auto recalc is enabled for a given table*/
+UNIV_INLINE
+bool
+dict_stats_auto_recalc_is_enabled(const dict_table_t* table)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Initialize table's stats for the first time when opening a table. */
+UNIV_INLINE
+void
+dict_stats_init(
+/*============*/
+	dict_table_t*	table);	/*!< in/out: table */
+
+/*********************************************************************//**
+Deinitialize table's stats after the last close of the table. This is
+used to detect "FLUSH TABLE" and refresh the stats upon next open. */
+UNIV_INLINE
+void
+dict_stats_deinit(
+/*==============*/
+	dict_table_t*	table)	/*!< in/out: table */
+	MY_ATTRIBUTE((nonnull));
+
+#ifdef WITH_WSREP
+/** Update the table modification counter and if necessary,
+schedule new estimates for table and index statistics to be calculated.
+@param[in,out]	table	persistent or temporary table
+@param[in]	trx	transaction */
+void dict_stats_update_if_needed(dict_table_t *table, const trx_t &trx)
+	MY_ATTRIBUTE((nonnull));
+#else
+/** Update the table modification counter and if necessary,
+schedule new estimates for table and index statistics to be calculated.
+@param[in,out]	table	persistent or temporary table */
+void dict_stats_update_if_needed_func(dict_table_t *table)
+	MY_ATTRIBUTE((nonnull));
+# define dict_stats_update_if_needed(t,trx) dict_stats_update_if_needed_func(t)
+#endif
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization.
+@return DB_* error code or DB_SUCCESS */
+dberr_t
+dict_stats_update(
+/*==============*/
+	dict_table_t*		table,	/*!< in/out: table */
+	dict_stats_upd_option_t	stats_upd_option);
+					/*!< in: whether to (re) calc
+					the stats or to fetch them from
+					the persistent storage */
+
+/** Execute DELETE FROM mysql.innodb_table_stats
+@param database_name  database name
+@param table_name     table name
+@param trx            transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete_from_table_stats(const char *database_name,
+                                           const char *table_name,
+                                           trx_t *trx)
+  MY_ATTRIBUTE((nonnull));
+/** Execute DELETE FROM mysql.innodb_index_stats
+@param database_name  database name
+@param table_name     table name
+@param trx            transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete_from_index_stats(const char *database_name,
+                                           const char *table_name,
+                                           trx_t *trx)
+  MY_ATTRIBUTE((nonnull));
+/** Execute DELETE FROM mysql.innodb_index_stats
+@param database_name  database name
+@param table_name     table name
+@param index_name     name of the index
+@param trx            transaction (nullptr=start and commit a new one)
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete_from_index_stats(const char *database_name,
+                                           const char *table_name,
+                                           const char *index_name, trx_t *trx);
+
+/*********************************************************************//**
+Fetches or calculates new estimates for index statistics. */
+void
+dict_stats_update_for_index(
+/*========================*/
+	dict_index_t*	index)	/*!< in/out: index */
+	MY_ATTRIBUTE((nonnull));
+
+/** Rename a table in InnoDB persistent stats storage.
+@param old_name  old table name
+@param new_name  new table name
+@param trx       transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_rename_table(const char *old_name, const char *new_name,
+                                trx_t *trx);
+/** Rename an index in InnoDB persistent statistics.
+@param db         database name
+@param table      table name
+@param old_name   old index name
+@param new_name   new index name
+@param trx        transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_rename_index(const char *db, const char *table,
+                                const char *old_name, const char *new_name,
+                                trx_t *trx);
+
+/** Delete all persistent statistics for a database.
+@param db    database name
+@param trx   transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete(const char *db, trx_t *trx);
+
+/** Save an individual index's statistic into the persistent statistics
+storage.
+@param[in]	index			index to be updated
+@param[in]	last_update		timestamp of the stat
+@param[in]	stat_name		name of the stat
+@param[in]	stat_value		value of the stat
+@param[in]	sample_size		n pages sampled or NULL
+@param[in]	stat_description	description of the stat
+@param[in,out]	trx			transaction
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_save_index_stat(
+	dict_index_t*	index,
+	time_t		last_update,
+	const char*	stat_name,
+	ib_uint64_t	stat_value,
+	ib_uint64_t*	sample_size,
+	const char*	stat_description,
+	trx_t*		trx)
+	MY_ATTRIBUTE((nonnull(1, 3, 6, 7)));
+
+/** Report an error if updating table statistics failed because
+.ibd file is missing, table decryption failed or table is corrupted.
+@param[in,out]	table	Table
+@param[in]	defragment	true if statistics is for defragment
+@retval DB_DECRYPTION_FAILED if decryption of the table failed
+@retval DB_TABLESPACE_DELETED if .ibd file is missing
+@retval DB_CORRUPTION if table is marked as corrupted */
+dberr_t
+dict_stats_report_error(dict_table_t* table, bool defragment = false)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#include "dict0stats.inl"
+
+#ifdef UNIV_ENABLE_UNIT_TEST_DICT_STATS
+void test_dict_stats_all();
+#endif /* UNIV_ENABLE_UNIT_TEST_DICT_STATS */
+
+#endif /* dict0stats_h */
diff --git a/storage/innobase/include/dict0stats.inl b/storage/innobase/include/dict0stats.inl
new file mode 100644
index 00000000..dd516275
--- /dev/null
+++ b/storage/innobase/include/dict0stats.inl
@@ -0,0 +1,219 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats.inl
+Code used for calculating and manipulating table statistics.
+
+Created Jan 23, 2012 Vasil Dimov
+*******************************************************/
+
+#include "dict0dict.h"
+#include "srv0srv.h"
+
+/** Record the table's explicit STATS_PERSISTENT setting.
+Only the in-memory table object is updated; nothing is saved on disk.
+The setting is read again from the .frm file upon the first open from
+MySQL after a server restart.
+If both ps_on and ps_off are set, ps_on takes precedence.
+@param[in,out]	table	table whose stat_persistent flags are replaced
+@param[in]	ps_on	persistent stats explicitly enabled
+@param[in]	ps_off	persistent stats explicitly disabled */
+UNIV_INLINE
+void
+dict_stats_set_persistent(
+	dict_table_t*	table,
+	ibool		ps_on,
+	ibool		ps_off)
+{
+	ib_uint32_t	flags = 0;
+
+	/* Both inputs are not supposed to be set at once, but a CREATE
+	or ALTER statement that contains
+	"STATS_PERSISTENT=0 STATS_PERSISTENT=1" would pass both. ON is
+	tested first and OFF only otherwise, so in that case OFF is
+	dropped and only ON is stored. */
+	if (ps_on) {
+		flags |= DICT_STATS_PERSISTENT_ON;
+	} else if (ps_off) {
+		flags |= DICT_STATS_PERSISTENT_OFF;
+	}
+
+	/* The complete value is computed first and then written with
+	one plain store; we rely on this assignment to be atomic. */
+	table->stat_persistent = flags;
+}
+
+/** @param[in]	table	table to inspect
+@return whether persistent statistics is enabled for the table */
+UNIV_INLINE
+bool
+dict_stats_is_persistent_enabled(const dict_table_t* table)
+{
+	/* This check is deliberately non-locking, so the answer may be
+	outdated as soon as it is returned: the table may become
+	PS-disabled right after we return true, or PS-enabled right
+	after we return false. A caller may therefore run
+	dict_stats_update(DICT_STATS_RECALC_PERSISTENT) on a table that
+	was just PS-disabled, or dict_stats_update(DICT_STATS_RECALC_TRANSIENT)
+	on a table that was just PS-enabled. This is acceptable: avoiding
+	it would require holding dict_sys.latch or stats_mutex_lock() as
+	for the other ::stat_ members, which would be too big a
+	performance penalty, especially when this function is called
+	from dict_stats_update_if_needed(). */
+
+	/* we rely on this read to be atomic */
+	const ib_uint32_t	stat_persistent = table->stat_persistent;
+
+	if (stat_persistent & DICT_STATS_PERSISTENT_ON) {
+		ut_ad(!(stat_persistent & DICT_STATS_PERSISTENT_OFF));
+		return true;
+	}
+
+	if (stat_persistent & DICT_STATS_PERSISTENT_OFF) {
+		return false;
+	}
+	/* Neither flag is set: fall back to srv_stats_persistent. */
+	return srv_stats_persistent;
+}
+
+/** Record the table's explicit STATS_AUTO_RECALC setting (only honored
+for a table with persistent statistics enabled). Only the in-memory table
+object is updated, not InnoDB files. The setting is read again from the
+.frm file upon the first open from MySQL after a server restart.
+@param[in,out]	table		table whose stats_auto_recalc is replaced
+@param[in]	auto_recalc_on	auto recalc explicitly enabled
+@param[in]	auto_recalc_off	auto recalc explicitly disabled */
+UNIV_INLINE
+void
+dict_stats_auto_recalc_set(
+	dict_table_t*	table,
+	ibool		auto_recalc_on,
+	ibool		auto_recalc_off)
+{
+	ut_ad(!auto_recalc_on || !auto_recalc_off);
+
+	ib_uint32_t	flags = 0;
+
+	if (auto_recalc_off) {
+		flags |= DICT_STATS_AUTO_RECALC_OFF;
+	}
+	if (auto_recalc_on) {
+		flags |= DICT_STATS_AUTO_RECALC_ON;
+	}
+
+	/* we rely on this assignment to be atomic */
+	table->stats_auto_recalc = flags;
+}
+
+/** @param[in]	table	table to inspect
+@return whether auto recalc is enabled for the table */
+UNIV_INLINE
+bool
+dict_stats_auto_recalc_is_enabled(const dict_table_t* table)
+{
+	/* we rely on this read to be atomic */
+	const ib_uint32_t	stats_auto_recalc = table->stats_auto_recalc;
+	if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_ON) {
+		ut_ad(!(stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF));
+		return true;
+	}
+	if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF) {
+		return false;
+	}
+	return srv_stats_auto_recalc;
+}
+
+/** Initialize a table's statistics when the table is opened for the
+first time. Does nothing if table->stat_initialized is already set.
+Otherwise calls dict_stats_update() with
+DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY if persistent statistics are
+enabled for the table, or else with DICT_STATS_RECALC_TRANSIENT.
+The return value of dict_stats_update() is ignored.
+@param[in,out]	table	table */
+UNIV_INLINE
+void
+dict_stats_init(
+	dict_table_t*	table)
+{
+	/* The stats mutex must not be held by the caller. */
+	ut_ad(!table->stats_mutex_is_owner());
+
+	if (!table->stat_initialized) {
+		const dict_stats_upd_option_t	opt
+			= dict_stats_is_persistent_enabled(table)
+			? DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY
+			: DICT_STATS_RECALC_TRANSIENT;
+
+		dict_stats_update(table, opt);
+	}
+}
+
+/** Deinitialize a table's statistics after the last close of the table.
+This is used to detect "FLUSH TABLE" and refresh the stats upon next open.
+The caller must own the table's stats mutex, and the reference count of
+the table must be 0 (both are asserted with ut_ad()).
+@param[in,out]	table	table */
+UNIV_INLINE
+void
+dict_stats_deinit(
+	dict_table_t*	table)
+{
+	ut_ad(table->stats_mutex_is_owner());
+	ut_ad(table->get_ref_count() == 0);
+
+#ifdef HAVE_valgrind
+	/* Nothing to mark if the statistics were never initialized. */
+	if (!table->stat_initialized) {
+		return;
+	}
+
+	/* Mark the table-level statistics as undefined for the memory
+	checker. */
+	MEM_UNDEFINED(&table->stat_n_rows, sizeof(table->stat_n_rows));
+	MEM_UNDEFINED(&table->stat_clustered_index_size,
+		      sizeof(table->stat_clustered_index_size));
+	MEM_UNDEFINED(&table->stat_sum_of_other_index_sizes,
+		      sizeof(table->stat_sum_of_other_index_sizes));
+	MEM_UNDEFINED(&table->stat_modified_counter,
+		      sizeof(table->stat_modified_counter));
+
+	/* Likewise for the statistics of every index of the table. */
+	for (dict_index_t* index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+		const size_t	n_uniq = index->n_uniq;
+
+		MEM_UNDEFINED(index->stat_n_diff_key_vals,
+			      n_uniq * sizeof(index->stat_n_diff_key_vals[0]));
+		MEM_UNDEFINED(index->stat_n_sample_sizes,
+			      n_uniq * sizeof(index->stat_n_sample_sizes[0]));
+		MEM_UNDEFINED(index->stat_n_non_null_key_vals,
+			      n_uniq
+			      * sizeof(index->stat_n_non_null_key_vals[0]));
+		MEM_UNDEFINED(&index->stat_index_size,
+			      sizeof(index->stat_index_size));
+		MEM_UNDEFINED(&index->stat_n_leaf_pages,
+			      sizeof(index->stat_n_leaf_pages));
+	}
+#endif /* HAVE_valgrind */
+	/* Make dict_stats_init() load or recalculate the statistics
+	the next time it is called for this table. */
+	table->stat_initialized = FALSE;
+}
diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h
new file mode 100644
index 00000000..d9a2f628
--- /dev/null
+++ b/storage/innobase/include/dict0stats_bg.h
@@ -0,0 +1,59 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats_bg.h
+Code used for background table and index stats gathering.
+
+Created Apr 26, 2012 Vasil Dimov
+*******************************************************/
+
+#ifndef dict0stats_bg_h
+#define dict0stats_bg_h
+
+#include "dict0types.h"
+
+#ifdef HAVE_PSI_INTERFACE
+extern mysql_pfs_key_t	recalc_pool_mutex_key;
+#endif /* HAVE_PSI_INTERFACE */
+
+/** Delete a table from the auto recalc pool, and ensure that
+no statistics are being updated on it. */
+void dict_stats_recalc_pool_del(table_id_t id, bool have_mdl_exclusive);
+
+/*****************************************************************//**
+Initialize global variables needed for the operation of the dict_stats task.
+Must be called before dict_stats task is started. */
+void dict_stats_init();
+
+/*****************************************************************//**
+Free resources allocated by dict_stats_init(), must be called
+after dict_stats task has exited. */
+void dict_stats_deinit();
+
+/** Start the dict stats timer. */
+void dict_stats_start();
+
+/** Shut down the dict_stats timer. */
+void dict_stats_shutdown();
+
+/** Reschedule dict stats timer to run now. */
+void dict_stats_schedule_now();
+
+#endif /* dict0stats_bg_h */
diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h
new file mode 100644
index 00000000..ec50e8cd
--- /dev/null
+++ b/storage/innobase/include/dict0types.h
@@ -0,0 +1,176 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0types.h
+Data dictionary global types
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0types_h
+#define dict0types_h
+
+#include "univ.i"
+#include "span.h"
+#include <rem0types.h>
+
+using st_::span;
+
+struct dict_col_t;
+struct dict_field_t;
+struct dict_index_t;
+struct dict_table_t;
+struct dict_foreign_t;
+struct dict_v_col_t;
+
+struct ind_node_t;
+struct tab_node_t;
+struct dict_add_v_col_t;
+
+/* Space id and page no where the dictionary header resides */
+#define	DICT_HDR_SPACE		0	/* the SYSTEM tablespace */
+#define	DICT_HDR_PAGE_NO	FSP_DICT_HDR_PAGE_NO
+
+/* The IDs of the ibuf table and its indexes are assigned as the number
+DICT_IBUF_ID_MIN plus the space id */
+#define DICT_IBUF_ID_MIN	0xFFFFFFFF00000000ULL
+
+typedef ib_id_t		table_id_t;
+typedef ib_id_t		index_id_t;
+
+/** Maximum transaction identifier */
+#define TRX_ID_MAX	IB_ID_MAX
+
+/** The bit pattern corresponding to TRX_ID_MAX */
+extern const byte trx_id_max_bytes[8];
+extern const byte timestamp_max_bytes[7];
+
+/** Error to ignore when we load table dictionary into memory. However,
+the table and index will be marked as "corrupted", and caller will
+be responsible to deal with corrupted table or index.
+Note: please define the DICT_ERR_IGNORE_* values as bits, so that they
+can be or-ed together */
+enum dict_err_ignore_t {
+	DICT_ERR_IGNORE_NONE = 0,	/*!< no error to ignore */
+	DICT_ERR_IGNORE_FK_NOKEY = 1,	/*!< ignore error if any foreign
+					key is missing */
+	DICT_ERR_IGNORE_INDEX = 2,	/*!< ignore corrupted indexes */
+	DICT_ERR_IGNORE_RECOVER_LOCK = 4 | DICT_ERR_IGNORE_FK_NOKEY,
+					/*!< Used when recovering table locks
+					for resurrected transactions.
+					Silently load a missing
+					tablespace, and do not load
+					incomplete index definitions. */
+	/** ignore all errors above (bits 1, 2 and 4 together) */
+	DICT_ERR_IGNORE_ALL = 7,
+	/** prepare some DDL operation; do not attempt to load
+	tablespace (DICT_ERR_IGNORE_ALL plus bit 8) */
+	DICT_ERR_IGNORE_TABLESPACE = 15,
+	/** prepare to drop the table; do not attempt to load tablespace
+	or the metadata (DICT_ERR_IGNORE_TABLESPACE plus bit 16) */
+	DICT_ERR_IGNORE_DROP = 31
+};
+
+/** Quiescing states for flushing tables to disk. */
+enum ib_quiesce_t {
+	QUIESCE_NONE,			/*!< value 0; presumably "not quiescing" (TODO confirm) */
+	QUIESCE_START,			/*!< Initialise, prepare to start */
+	QUIESCE_COMPLETE		/*!< All done */
+};
+
+/** Prefix for InnoDB internal tables, adopted from sql/table.h */
+#define TEMP_FILE_PREFIX_INNODB		"#sql-ib"
+
+/** Table name wrapper for pretty-printing */
+struct table_name_t
+{
+	/** The name in internal representation: schema name, '/', table name */
+	char*	m_name;
+
+	/** Default constructor; does not set m_name */
+	table_name_t() = default;
+	/** Constructor; the string is not copied, only the pointer is kept */
+	table_name_t(char* name) : m_name(name) {}
+
+	/** @return the end of the schema name: the first '/' in m_name */
+	const char* dbend() const
+	{
+		const char* sep = strchr(m_name, '/');
+		ut_ad(sep);
+		return sep;
+	}
+
+	/** @return the length of the schema name, in bytes */
+	size_t dblen() const { return size_t(dbend() - m_name); }
+
+	/** Determine the filename-safe encoded table name.
+	@return	the part of m_name that follows the first '/' */
+	const char* basename() const { return dbend() + 1; }
+
+	/** The start of the table basename suffix for partitioned tables */
+	static const char part_suffix[4];
+
+	/** Determine the partition or subpartition name suffix.
+	@return the first occurrence of part_suffix in basename()
+	@retval	NULL	if the table is not partitioned */
+	const char* part() const { return strstr(basename(), part_suffix); }
+
+	/** @return whether this is a temporary or intermediate table name */
+	inline bool is_temporary() const;
+};
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/** Dump the change buffer at startup */
+extern my_bool		ibuf_dump;
+/** Flag to control insert buffer debugging. */
+extern uint		ibuf_debug;
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+/** Shift for spatial status */
+#define SPATIAL_STATUS_SHIFT	12
+
+/** Mask to encode/decode spatial status. */
+#define SPATIAL_STATUS_MASK	(3U << SPATIAL_STATUS_SHIFT)
+
+#if SPATIAL_STATUS_MASK < REC_VERSION_56_MAX_INDEX_COL_LEN
+# error SPATIAL_STATUS_MASK < REC_VERSION_56_MAX_INDEX_COL_LEN
+#endif
+
+/** Whether a column is used in a spatial index or a regular index.
+Note: the spatial status is part of the persistent undo log,
+so we should not modify the values in MySQL 5.7 */
+enum spatial_status_t {
+	/** Unknown status (undo format in 5.7.9) */
+	SPATIAL_UNKNOWN = 0,
+
+	/** Not used in gis index. */
+	SPATIAL_NONE	= 1,
+
+	/** Used in both spatial index and regular index. */
+	SPATIAL_MIXED	= 2,
+
+	/** Only used in spatial index. */
+	SPATIAL_ONLY	= 3
+};
+
+#define TABLE_STATS_NAME "mysql/innodb_table_stats"
+#define INDEX_STATS_NAME "mysql/innodb_index_stats"
+
+#endif
diff --git a/storage/innobase/include/dyn0buf.h b/storage/innobase/include/dyn0buf.h
new file mode 100644
index 00000000..06af4dcc
--- /dev/null
+++ b/storage/innobase/include/dyn0buf.h
@@ -0,0 +1,442 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dyn0buf.h
+The dynamically allocated buffer implementation
+
+Created 2013-03-16 Sunny Bains
+*******************************************************/
+
+#ifndef dyn0buf_h
+#define dyn0buf_h
+
+#include "mem0mem.h"
+#include "dyn0types.h"
+#include "ilist.h"
+
+
+/** Class that manages dynamic buffers. It uses a sized_ilist of
+mtr_buf_t::block_t instances. We don't use STL containers in
+order to avoid the overhead of heap calls. Using a custom memory
+allocator doesn't solve the problem either because we have to get
+the memory from somewhere. We can't use the block_t::m_data as the
+backend for the custom allocator because we would like the data in
+the blocks to be contiguous. */
+class mtr_buf_t {
+public:
+	/** SIZE - sizeof(m_node) + sizeof(m_used) */
+	enum { MAX_DATA_SIZE = DYN_ARRAY_DATA_SIZE
+	       - sizeof(ilist_node<>) + sizeof(uint32_t) };
+
+	class block_t : public ilist_node<> {
+	public:
+		/** Construct an empty block. */
+		block_t()
+		{
+			compile_time_assert(MAX_DATA_SIZE <= (2 << 15));
+			init();
+		}
+
+		/** Get the number of used bytes in the block, with
+		DYN_BLOCK_FULL_FLAG masked out of m_used.
+		@return	number of bytes used */
+		ulint used() const
+			MY_ATTRIBUTE((warn_unused_result))
+		{
+			return static_cast<ulint>(m_used & ~DYN_BLOCK_FULL_FLAG);
+		}
+
+		/** Get a pointer to the start of the data; this is the
+		same as the non-const begin().
+		@return	pointer to data */
+		byte* start()
+			MY_ATTRIBUTE((warn_unused_result))
+		{
+			return m_data;
+		}
+
+		/** Non-const version.
+		@return start of data */
+		byte* begin()
+			MY_ATTRIBUTE((warn_unused_result))
+		{
+			return m_data;
+		}
+
+		/** Non-const version.
+		@return end of used data: m_used bytes after begin() */
+		byte* end()
+			MY_ATTRIBUTE((warn_unused_result))
+		{
+			return m_data + m_used;
+		}
+
+		/** Const version.
+		@return start of data */
+		const byte* begin() const
+			MY_ATTRIBUTE((warn_unused_result))
+		{
+			return m_data;
+		}
+
+		/** Const version.
+		@return end of used data: m_used bytes after begin() */
+		const byte* end() const
+			MY_ATTRIBUTE((warn_unused_result))
+		{
+			return m_data + m_used;
+		}
+
+	private:
+		/** Reserve size bytes after the current end of used data.
+		@return pointer to start of reserved space */
+		template <typename Type>
+		Type push(uint32_t size)
+		{
+			byte* const	reserved = m_data + m_used;
+
+			m_used += size;
+			ut_ad(m_used <= uint32_t(MAX_DATA_SIZE));
+
+			return reinterpret_cast<Type>(reserved);
+		}
+
+		/** Set the end of used data to ptr, after a write that was
+		started by mtr_buf_t::open(). */
+		void close(const byte* ptr)
+		{
+			/* ptr must not be past the end recorded by open() */
+			ut_ad(ptr >= begin());
+			ut_ad(ptr <= begin() + m_buf_end);
+
+			/* Everything before ptr now counts as used. */
+			m_used = static_cast<uint32_t>(ptr - m_data);
+
+			ut_ad(m_used <= MAX_DATA_SIZE);
+			ut_d(m_buf_end = 0);
+		}
+
+		/** Initialise the block: mark it empty and, in debug
+		builds, reset m_buf_end and set the magic number. */
+		void init()
+		{
+			m_used = 0;
+			ut_d(m_buf_end = 0);
+			ut_d(m_magic_n = DYN_BLOCK_MAGIC_N);
+		}
+	private:
+#ifdef UNIV_DEBUG
+		/** End offset of the open buffer; 0 when not opened */
+		ulint		m_buf_end;
+
+		/** Magic number; init() sets it to DYN_BLOCK_MAGIC_N */
+		ulint		m_magic_n;
+#endif /* UNIV_DEBUG */
+
+		/** Storage for the data bytes */
+		byte		m_data[MAX_DATA_SIZE];
+
+		/** number of data bytes used in this block;
+		used() masks DYN_BLOCK_FULL_FLAG out of this value */
+		uint32_t	m_used;
+
+		friend class mtr_buf_t;
+	};
+
+	typedef sized_ilist<block_t> list_t;
+
+	/** Default constructor: starts with only m_first_block in the list */
+	mtr_buf_t()
+		:
+		m_heap(NULL),
+		m_size(0)
+	{
+		push_back(&m_first_block);
+	}
+
+	/** Destructor: frees the heap, if there is one, by calling erase() */
+	~mtr_buf_t()
+	{
+		erase();
+	}
+
+	/** Reset the buffer: free heap blocks, keep an empty m_first_block */
+	void erase()
+	{
+		if (m_heap != NULL) {
+			mem_heap_free(m_heap);
+			m_heap = NULL;
+			/* Relink the first block through push_back() so that
+			it is init()ed too and no stale m_used survives. */
+			m_list.clear();
+			push_back(&m_first_block);
+		} else {
+			m_first_block.init();
+			ut_ad(m_list.size() == 1);
+		}
+
+		m_size = 0;
+	}
+
+	/** Reserve up to size bytes at the end of the buffer for writing.
+	A new block is added if the last block lacks the space. After
+	copying the elements, the caller must call close() with the end
+	of the data that was actually written.
+	@param size	in bytes of the buffer; MUST be <= MAX_DATA_SIZE!
+	@return	pointer to the start of the reserved space */
+	byte* open(ulint size)
+		MY_ATTRIBUTE((warn_unused_result))
+	{
+		ut_ad(size > 0);
+		ut_ad(size <= MAX_DATA_SIZE);
+
+		block_t* const	block = has_space(size)
+			? back() : add_block();
+
+		ut_ad(block->m_used <= MAX_DATA_SIZE);
+		ut_d(block->m_buf_end = block->m_used + size);
+
+		return block->begin() + block->m_used;
+	}
+
+	/** Close the buffer returned by open(), and account for the bytes
+	that were written to the last block in m_size.
+	@param ptr	end of used space */
+	void close(const byte* ptr)
+	{
+		ut_ad(!m_list.empty());
+		block_t* const	block = back();
+		const ulint	used_before = block->used();
+
+		block->close(ptr);
+
+		/* Replace the block's old contribution with the new one. */
+		m_size = m_size - used_before + block->used();
+	}
+
+	/** Reserve size bytes at the end of the buffer and count them in
+	m_size right away. A new block is added if the last one lacks the
+	space. The caller must copy the element to the returned pointer.
+	@param size	in bytes of the element; must be in 1..MAX_DATA_SIZE
+	@return	pointer to the element */
+	template <typename Type>
+	Type push(uint32_t size)
+	{
+		ut_ad(size > 0);
+		ut_ad(size <= MAX_DATA_SIZE);
+
+		block_t* const	block = has_space(size)
+			? back() : add_block();
+
+		m_size += size;
+
+		/* See ISO C++03 14.2/4 for why the "template" keyword
+		is required in the call below. */
+
+		return block->template push<Type>(size);
+	}
+
+	/** Append len bytes, in pieces of at most MAX_DATA_SIZE bytes.
+	@param ptr	start of the data to copy
+	@param len	number of bytes to copy; 0 copies nothing */
+	void push(const byte* ptr, uint32_t len)
+	{
+		while (len != 0) {
+			/* One piece must fit in a single block. */
+			const uint32_t	chunk = std::min(
+				len, uint32_t(MAX_DATA_SIZE));
+
+			::memmove(push<byte*>(chunk), ptr, chunk);
+			ptr += chunk;
+			len -= chunk;
+		}
+	}
+
+	/** Return a pointer to an element in the buffer. const version.
+	@param pos	position of element in bytes from start;
+			must be less than the total size stored
+	@return	pointer to element */
+	template <typename Type>
+	const Type at(ulint pos) const
+	{
+		/* find() makes pos relative to the block it returns. */
+		block_t*	block = const_cast<mtr_buf_t*>(this)->find(pos);
+		ut_ad(block != NULL);
+		return(reinterpret_cast<Type>(block->begin() + pos));
+	}
+
+	/** Return a pointer to an element in the buffer. non const version.
+	@param pos	position of element in bytes from start;
+			must be less than the total size stored
+	@return	pointer to element */
+	template <typename Type>
+	Type at(ulint pos)
+	{
+		block_t*	block = find(pos);
+		ut_ad(block != NULL);
+		return(reinterpret_cast<Type>(block->begin() + pos));
+	}
+
+	/** Return the total number of bytes stored. Debug builds also
+	check that m_size equals the sum of used() over all blocks.
+	@return	data size in bytes */
+	ulint size() const
+		MY_ATTRIBUTE((warn_unused_result))
+	{
+#ifdef UNIV_DEBUG
+		ulint			total_size = 0;
+		list_t::iterator	it = m_list.begin();
+		list_t::iterator	end = m_list.end();
+		while (it != end) {
+			total_size += it->used();
+			++it;
+		}
+		ut_ad(total_size == m_size);
+#endif /* UNIV_DEBUG */
+		return m_size;
+	}
+
+	/** Call the functor on each block, from the first to the last,
+	stopping at the first block for which it returns false.
+	@return	false if iteration was terminated. */
+	template <typename Functor>
+	bool for_each_block(const Functor& functor) const
+	{
+		list_t::iterator	it = m_list.begin();
+		list_t::iterator	end = m_list.end();
+		while (it != end) {
+			if (!functor(&*it)) {
+				return false;
+			}
+			++it;
+		}
+		return true;
+	}
+
+	/** Get the block at the head of m_list.
+	@return the first block */
+	block_t* front()
+		MY_ATTRIBUTE((warn_unused_result))
+	{
+		return &m_list.front();
+	}
+
+	/** Check whether no block has been allocated from the heap.
+	@return true if m_heap is NULL, i.e. add_block() was not needed */
+	bool is_small() const
+		MY_ATTRIBUTE((warn_unused_result))
+	{
+		return !m_heap;
+	}
+
+	/** @return whether the buffer is empty (tested on the last block) */
+	bool empty() const { return back()->m_used == 0; }
+
+private:
+	// Disable copying
+	mtr_buf_t(const mtr_buf_t&);
+	mtr_buf_t& operator=(const mtr_buf_t&);
+
+	/** Initialise the block with init(), which resets its m_used
+	to 0, and then add it to the end of the list. */
+	void push_back(block_t* block)
+	{
+		block->init();
+		m_list.push_back(*block);
+	}
+
+	/** @return the last block in the list (non-const pointer) */
+	block_t* back() const
+	{
+		return &const_cast<block_t&>(m_list.back());
+	}
+
+	/** @param size	number of bytes requested
+	@return true if the last block has room for size more bytes */
+	bool has_space(ulint size) const
+	{
+		return(back()->m_used + size <= MAX_DATA_SIZE);
+	}
+
+	/** Non-const overload with the same body as the const overload.
+	@return true if the last block has room for size more bytes */
+	bool has_space(ulint size)
+	{
+		return(back()->m_used + size <= MAX_DATA_SIZE);
+	}
+
+	/** Find the block that contains the byte at an absolute offset.
+	@param pos	in: offset from the start of the buffer;
+			out: offset relative to the returned block
+	@return the block containing pos
+	@retval NULL if pos is not less than the sum of used() */
+	block_t* find(ulint& pos)
+	{
+		ut_ad(!m_list.empty());
+
+		for (list_t::iterator it = m_list.begin(), end = m_list.end();
+		     it != end; ++it) {
+			if (pos >= it->used()) {
+				/* pos lies beyond this block: skip it. */
+				pos -= it->used();
+				continue;
+			}
+			ut_ad(it->used() >= pos);
+			return &*it;
+		}
+
+		return NULL;
+	}
+
+	/** Allocate a new block from m_heap (creating the heap on first
+	use), initialise it and add it to the end of m_list.
+	@return the new block */
+	block_t* add_block()
+	{
+		if (!m_heap) {
+			m_heap = mem_heap_create(sizeof(block_t));
+		}
+
+		void*	mem = mem_heap_alloc(m_heap, sizeof(block_t));
+		block_t* const	block = reinterpret_cast<block_t*>(mem);
+
+		/* The memory is not constructed; push_back() init()s it. */
+		push_back(block);
+
+		return block;
+	}
+
+private:
+	/** Heap to use for memory allocation */
+	mem_heap_t*		m_heap;
+
+	/** Allocated blocks */
+	list_t			m_list;
+
+	/** Total size used by all blocks */
+	ulint			m_size;
+
+	/** The default block, should always be the first element. This
+	is for backwards compatibility and to avoid an extra heap allocation
+	for small REDO log records */
+	block_t			m_first_block;
+};
+
+#endif /* dyn0buf_h */
diff --git a/storage/innobase/include/dyn0types.h b/storage/innobase/include/dyn0types.h
new file mode 100644
index 00000000..83d0b0d6
--- /dev/null
+++ b/storage/innobase/include/dyn0types.h
@@ -0,0 +1,39 @@
+/*****************************************************************************
+
+Copyright (c) 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dyn0types.h
+The dynamically allocated buffer types and constants
+
+Created 2013-03-16 Sunny Bains
+*******************************************************/
+
+#ifndef dyn0types_h
+#define dyn0types_h
+
+/** Value of mtr_buf_t::block_t::m_magic_n */
+#define DYN_BLOCK_MAGIC_N	375767
+
+/** This is the initial 'payload' size of a dynamic array */
+#define	DYN_ARRAY_DATA_SIZE	512
+
+/** Flag for mtr_buf_t::block_t::m_used that indicates a full block */
+#define DYN_BLOCK_FULL_FLAG	0x1000000UL
+
+#endif /* dyn0types_h */
diff --git a/storage/innobase/include/eval0eval.h b/storage/innobase/include/eval0eval.h
new file mode 100644
index 00000000..a3ea0462
--- /dev/null
+++ b/storage/innobase/include/eval0eval.h
@@ -0,0 +1,109 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0eval.h
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef eval0eval_h
+#define eval0eval_h
+
+#include "que0types.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+
+/*****************************************************************//**
+Free the buffer from global dynamic memory for a value of a que_node,
+if it has been allocated by eval_node_alloc_val_buf(). The freeing for pushed
+column values is done in sel_col_prefetch_buf_free. */
+void
+eval_node_free_val_buf(
+/*===================*/
+	que_node_t*	node);	/*!< in: query graph node */
+/*****************************************************************//**
+Evaluates a symbol table symbol. */
+UNIV_INLINE
+void
+eval_sym(
+/*=====*/
+	sym_node_t*	sym_node);	/*!< in: symbol table node */
+/*****************************************************************//**
+Evaluates an expression. */
+UNIV_INLINE
+void
+eval_exp(
+/*=====*/
+	que_node_t*	exp_node);	/*!< in: expression */
+/*****************************************************************//**
+Sets an integer value as the value of an expression node. */
+UNIV_INLINE
+void
+eval_node_set_int_val(
+/*==================*/
+	que_node_t*	node,	/*!< in: expression node */
+	lint		val);	/*!< in: value to set */
+/*****************************************************************//**
+Gets an integer value from an expression node.
+@return integer value */
+UNIV_INLINE
+lint
+eval_node_get_int_val(
+/*==================*/
+	que_node_t*	node);	/*!< in: expression node */
+/*****************************************************************//**
+Copies a binary string value as the value of a query graph node. Allocates a
+new buffer if necessary. */
+UNIV_INLINE
+void
+eval_node_copy_and_alloc_val(
+/*=========================*/
+	que_node_t*	node,	/*!< in: query graph node */
+	const byte*	str,	/*!< in: binary string */
+	ulint		len);	/*!< in: string length or UNIV_SQL_NULL */
+/*****************************************************************//**
+Copies a query node value to another node. */
+UNIV_INLINE
+void
+eval_node_copy_val(
+/*===============*/
+	que_node_t*	node1,	/*!< in: node to copy to */
+	que_node_t*	node2);	/*!< in: node to copy from */
+/*****************************************************************//**
+Gets an ibool value from a query node.
+@return ibool value */
+UNIV_INLINE
+ibool
+eval_node_get_ibool_val(
+/*====================*/
+	que_node_t*	node);	/*!< in: query graph node */
+/*****************************************************************//**
+Evaluates a comparison node.
+@return the result of the comparison */
+ibool
+eval_cmp(
+/*=====*/
+	func_node_t*	cmp_node);	/*!< in: comparison node */
+
+
+#include "eval0eval.inl"
+
+#endif
diff --git a/storage/innobase/include/eval0eval.inl b/storage/innobase/include/eval0eval.inl
new file mode 100644
index 00000000..0ea4057f
--- /dev/null
+++ b/storage/innobase/include/eval0eval.inl
@@ -0,0 +1,254 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0eval.inl
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "pars0grm.h"
+
+/*****************************************************************//**
+Evaluates a function node. Declared here for eval_exp(), which calls it. */
+void
+eval_func(
+/*======*/
+	func_node_t*	func_node);	/*!< in: function node */
+/*****************************************************************//**
+Allocate a buffer from global dynamic memory for a value of a que_node.
+NOTE that this memory must be explicitly freed when the query graph is
+freed. If the node already has an allocated buffer, that buffer is freed
+here. NOTE that this is the only function where dynamic memory should be
+allocated for a query node val field.
+@return pointer to allocated buffer */
+byte*
+eval_node_alloc_val_buf(
+/*====================*/
+	que_node_t*	node,	/*!< in: query graph node; sets the val field
+				data field to point to the new buffer, and
+				len field equal to size */
+	ulint		size);	/*!< in: buffer size */
+
+
+/*****************************************************************//**
+Makes sure that the value buffer of a query graph node can hold size
+bytes. The len field of the value is set to size, and a new buffer is
+allocated only if there is no buffer yet or the current one is too small.
+@return pointer to buffer */
+UNIV_INLINE
+byte*
+eval_node_ensure_val_buf(
+/*=====================*/
+	que_node_t*	node,	/*!< in: query graph node; sets the val field
+				data field to point to the new buffer, and
+				len field equal to size */
+	ulint		size)	/*!< in: buffer size */
+{
+	dfield_t* const	val = que_node_get_val(node);
+
+	dfield_set_len(val, size);
+
+	byte* const	buf = static_cast<byte*>(dfield_get_data(val));
+
+	if (buf != NULL && que_node_get_val_buf_size(node) >= size) {
+		/* The current buffer is large enough: keep using it. */
+		return(buf);
+	}
+
+	return(eval_node_alloc_val_buf(node, size));
+}
+
+/*****************************************************************//**
+Evaluates a symbol table symbol: when the node has an indirection, its value
+field is filled with dfield_copy_data() from the node it is an alias for. */
+UNIV_INLINE
+void
+eval_sym(
+/*=====*/
+	sym_node_t*	sym_node)	/*!< in: symbol table node */
+{
+	ut_ad(que_node_get_type(sym_node) == QUE_NODE_SYMBOL);
+
+	if (!sym_node->indirection) {
+		/* Not an alias for a variable or a column: nothing to do. */
+		return;
+	}
+
+	dfield_copy_data(que_node_get_val(sym_node),
+			 que_node_get_val(sym_node->indirection));
+}
+
+/*****************************************************************//**
+Evaluates an expression: a symbol node is handled by eval_sym() and any
+other node is passed on to eval_func() as a function node. */
+UNIV_INLINE
+void
+eval_exp(
+/*=====*/
+	que_node_t*	exp_node)	/*!< in: expression */
+{
+	if (que_node_get_type(exp_node) != QUE_NODE_SYMBOL) {
+		/* Everything but a symbol is evaluated as a function. */
+		eval_func(static_cast<func_node_t*>(exp_node));
+		return;
+	}
+
+	eval_sym((sym_node_t*) exp_node);
+}
+
+/*****************************************************************//**
+Stores an integer as the 4-byte value of an expression node. A 4-byte
+buffer is allocated first if the node has no value data yet. */
+UNIV_INLINE
+void
+eval_node_set_int_val(
+/*==================*/
+	que_node_t*	node,	/*!< in: expression node */
+	lint		val)	/*!< in: value to set */
+{
+	dfield_t* const	field = que_node_get_val(node);
+
+	byte*	buf = static_cast<byte*>(dfield_get_data(field));
+
+	if (!buf) {
+		/* No value data yet: allocate room for the 4 bytes. */
+		buf = eval_node_alloc_val_buf(node, 4);
+	}
+
+	/* Whether reused or new, the value must be exactly 4 bytes long. */
+	ut_ad(dfield_get_len(field) == 4);
+
+	mach_write_to_4(buf, (ulint) val);
+}
+
+/*****************************************************************//**
+Gets an integer value from an expression node. The value must be 4 bytes
+long and not SQL NULL.
+@return integer value */
+UNIV_INLINE
+lint
+eval_node_get_int_val(
+/*==================*/
+	que_node_t*	node)	/*!< in: expression node */
+{
+	dfield_t* const	field = que_node_get_val(node);
+	const byte*	bytes = static_cast<byte*>(dfield_get_data(field));
+
+	ut_ad(dfield_get_len(field) == 4);
+
+	/* The value read is converted via int to the lint return type. */
+	const int	value = (int) mach_read_from_4(bytes);
+	return(value);
+}
+
+/*****************************************************************//**
+Reads the ibool value of a query node.
+The value is taken from the first byte of the node's value data, which
+must not be NULL.
+@return ibool value */
+UNIV_INLINE
+ibool
+eval_node_get_ibool_val(
+/*====================*/
+	que_node_t*	node)	/*!< in: query graph node */
+{
+	dfield_t* const	field = que_node_get_val(node);
+
+	byte* const	flag = static_cast<byte*>(dfield_get_data(field));
+
+	/* A node without value data has no boolean to return. */
+	ut_ad(flag != NULL);
+
+	return(mach_read_from_1(flag));
+}
+
+/*****************************************************************//**
+Stores an ibool as the value of a function node.
+The value occupies 1 byte; that byte is allocated with
+eval_node_alloc_val_buf() if the node has no value data yet. */
+UNIV_INLINE
+void
+eval_node_set_ibool_val(
+/*====================*/
+	func_node_t*	func_node,	/*!< in: function node */
+	ibool		val)		/*!< in: value to set */
+{
+	dfield_t* const	field = que_node_get_val(func_node);
+
+	byte*	flag = static_cast<byte*>(dfield_get_data(field));
+
+	if (!flag) {
+		/* No value data yet: allocate 1 byte to hold the
+		value. */
+		flag = eval_node_alloc_val_buf(func_node, 1);
+	}
+
+	/* The value field must describe exactly one byte. */
+	ut_ad(dfield_get_len(field) == 1);
+
+	mach_write_to_1(flag, val);
+}
+
+/*****************************************************************//**
+Copies a binary string into the value of a query graph node, allocating a
+new buffer if necessary. For SQL NULL only the length is recorded. */
+UNIV_INLINE
+void
+eval_node_copy_and_alloc_val(
+/*=========================*/
+	que_node_t*	node,	/*!< in: query graph node */
+	const byte*	str,	/*!< in: binary string */
+	ulint		len)	/*!< in: string length or UNIV_SQL_NULL */
+{
+	if (len != UNIV_SQL_NULL) {
+		/* This also sets the length of the value to len. */
+		byte*	dest = eval_node_ensure_val_buf(node, len);
+
+		memcpy(dest, str, len);
+
+		return;
+	}
+
+	/* SQL NULL carries no data: record the length only. */
+	dfield_set_len(que_node_get_val(node), len);
+}
+
+/*****************************************************************//**
+Copies a query node value to another node: the data pointer and length
+of node2's value are handed to eval_node_copy_and_alloc_val(). */
+UNIV_INLINE
+void
+eval_node_copy_val(
+/*===============*/
+	que_node_t*	node1,	/*!< in: node to copy to */
+	que_node_t*	node2)	/*!< in: node to copy from */
+{
+	dfield_t* const	src = que_node_get_val(node2);
+
+	const byte*	src_data = static_cast<byte*>(dfield_get_data(src));
+	const ulint	src_len = dfield_get_len(src);
+
+	/* The callee allocates a buffer for node1 if one is needed. */
+	eval_node_copy_and_alloc_val(node1, src_data, src_len);
+}
diff --git a/storage/innobase/include/eval0proc.h b/storage/innobase/include/eval0proc.h
new file mode 100644
index 00000000..a93140bf
--- /dev/null
+++ b/storage/innobase/include/eval0proc.h
@@ -0,0 +1,94 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0proc.h
+Executes SQL stored procedures and their control structures
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#ifndef eval0proc_h
+#define eval0proc_h
+
+#include "que0types.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+
+/**********************************************************************//**
+Performs an execution step of a procedure node.
+@return query thread to run next; the inline definition returns thr */
+UNIV_INLINE
+que_thr_t*
+proc_step(
+/*======*/
+	que_thr_t*	thr);	/*!< in/out: query thread; run_node is updated */
+/**********************************************************************//**
+Performs an execution step of an if-statement node (not an inline function).
+@return query thread to run next or NULL */
+que_thr_t*
+if_step(
+/*====*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a while-statement node (not an inline function).
+@return query thread to run next or NULL */
+que_thr_t*
+while_step(
+/*=======*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a for-loop node (not an inline function).
+@return query thread to run next or NULL */
+que_thr_t*
+for_step(
+/*=====*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an assignment statement node (not inline).
+@return query thread to run next or NULL */
+que_thr_t*
+assign_step(
+/*========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a procedure call node.
+@return query thread to run next; the inline definition returns thr */
+UNIV_INLINE
+que_thr_t*
+proc_eval_step(
+/*===========*/
+	que_thr_t*	thr);	/*!< in/out: query thread; run_node is updated */
+/**********************************************************************//**
+Performs an execution step of an exit statement node (not an inline function).
+@return query thread to run next or NULL */
+que_thr_t*
+exit_step(
+/*======*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a return-statement node (not inline).
+@return query thread to run next or NULL */
+que_thr_t*
+return_step(
+/*========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+
+#include "eval0proc.inl"
+
+#endif
diff --git a/storage/innobase/include/eval0proc.inl b/storage/innobase/include/eval0proc.inl
new file mode 100644
index 00000000..b0c5f75b
--- /dev/null
+++ b/storage/innobase/include/eval0proc.inl
@@ -0,0 +1,88 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0proc.inl
+Executes SQL stored procedures and their control structures
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#include "pars0pars.h"
+#include "que0que.h"
+#include "eval0eval.h"
+
+/**********************************************************************//**
+Performs an execution step of a procedure node: starts the statement list
+if the previous node is the parent, and otherwise continues in the parent.
+@return query thread to run next; this definition always returns thr */
+UNIV_INLINE
+que_thr_t*
+proc_step(
+/*======*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ut_ad(thr);
+
+	proc_node_t* const	proc = static_cast<proc_node_t*>(thr->run_node);
+	ut_ad(que_node_get_type(proc) == QUE_NODE_PROC);
+
+	if (thr->prev_node != que_node_get_parent(proc)) {
+		/* The previous node is not the parent: it must have no
+		successor, and the parent runs next. */
+		ut_ad(que_node_get_next(thr->prev_node) == NULL);
+
+		thr->run_node = que_node_get_parent(proc);
+		return(thr);
+	}
+
+	/* Start execution from the first statement in the statement list */
+	thr->run_node = proc->stat_list;
+
+	if (thr->run_node == NULL) {
+		/* The statement list is empty: the parent runs next. */
+		thr->run_node = que_node_get_parent(proc);
+	}
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a procedure call node: evaluates the
+function node with eval_exp() and then continues in its parent node.
+@return query thread to run next; this definition always returns thr */
+UNIV_INLINE
+que_thr_t*
+proc_eval_step(
+/*===========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ut_ad(thr);
+
+	func_node_t* const	call = static_cast<func_node_t*>(thr->run_node);
+
+	ut_ad(que_node_get_type(call) == QUE_NODE_FUNC);
+
+	/* Evaluate the procedure */
+	eval_exp(call);
+
+	/* Execution continues in the parent of the call node. */
+	thr->run_node = que_node_get_parent(call);
+
+	return(thr);
+}
diff --git a/storage/innobase/include/fil0crypt.h b/storage/innobase/include/fil0crypt.h
new file mode 100644
index 00000000..f43965cd
--- /dev/null
+++ b/storage/innobase/include/fil0crypt.h
@@ -0,0 +1,396 @@
+/*****************************************************************************
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0crypt.h
+The low-level file system encryption support functions
+
+Created 04/01/2015 Jan Lindström
+*******************************************************/
+
+#ifndef fil0crypt_h
+#define fil0crypt_h
+
+#include "my_crypt.h"
+#include "fil0fil.h"
+
+/**
+* Magic pattern (CRYPT_MAGIC, MAGIC_SZ bytes) in start of crypt data on page 0
+*/
+#define MAGIC_SZ 6
+
+static const unsigned char CRYPT_MAGIC[MAGIC_SZ] = {
+	's', 0xE, 0xC, 'R', 'E', 't' };
+
+/** This key will be used if nothing else is given */
+#define FIL_DEFAULT_ENCRYPTION_KEY ENCRYPTION_KEY_SYSTEM_DATA
+
+/** Wake up the encryption threads; broadcast defaults to false */
+void fil_crypt_threads_signal(bool broadcast= false);
+
+/**
+ * CRYPT_SCHEME_UNENCRYPTED
+ *
+ * Used as intermediate state when converting a space from unencrypted
+ * to encrypted
+ */
+/**
+ * CRYPT_SCHEME_1
+ *
+ * xxx is AES_CTR or AES_CBC (or another block cipher with the same key and iv lengths)
+ *  L = AES_ECB(KEY, IV)
+ *  CRYPT(PAGE) = xxx(KEY=L, IV=C, PAGE)
+ */
+
+#define CRYPT_SCHEME_1 1
+#define CRYPT_SCHEME_1_IV_LEN 16
+#define CRYPT_SCHEME_UNENCRYPTED 0
+
+/** Cached L or key for given key_version */
+struct key_struct
+{
+	uint key_version;			/*!< Version of the key */
+	uint key_length;			/*!< Key length */
+	unsigned char key[MY_AES_MAX_KEY_LENGTH]; /*!< Cached key
+						(that is L in CRYPT_SCHEME_1) */
+};
+
+/** is encryption enabled */
+extern ulong	srv_encrypt_tables;
+
+/** Mutex helper for crypt_data->scheme
+@param[in,out]	scheme	encryption scheme
+@param[in]	exit	should we exit or enter mutex ? */
+void
+crypt_data_scheme_locker(
+	st_encryption_scheme*	scheme,
+	int			exit);
+
+struct fil_space_rotate_state_t
+{
+	time_t start_time;	/*!< time when rotation started */
+	ulint active_threads;	/*!< number of active threads in space */
+	uint32_t next_offset;	/*!< next "free" offset */
+	uint32_t max_offset;	/*!< max offset needing to be rotated */
+	uint  min_key_version_found; /*!< min key version found but not
+				     rotated */
+	lsn_t end_lsn;		/*!< max lsn created when rotating this
+				space */
+	bool starting;		/*!< initial write of IV */
+	bool flushing;		/*!< space is being flushed at end of rotation */
+};
+
+#ifndef UNIV_INNOCHECKSUM
+
+struct fil_space_crypt_t : st_encryption_scheme
+{
+ public:
+	/** Constructor. Initializes the members as shown below; type,
+	min_key_version and key_found depend on new_encryption and
+	srv_encrypt_tables. new_type is overwritten by either branch. */
+	fil_space_crypt_t(
+		uint new_type,
+		uint new_min_key_version,
+		uint new_key_id,
+		fil_encryption_t new_encryption)
+		: st_encryption_scheme(),
+		min_key_version(new_min_key_version),
+		encryption(new_encryption),
+		key_found(0),
+		rotate_state()
+	{
+		key_id = new_key_id;
+		my_random_bytes(iv, sizeof(iv));
+		mysql_mutex_init(0, &mutex, nullptr);
+		locker = crypt_data_scheme_locker;
+		type = new_type;
+
+		if (new_encryption == FIL_ENCRYPTION_OFF ||
+			(!srv_encrypt_tables &&
+			 new_encryption == FIL_ENCRYPTION_DEFAULT)) {
+			type = CRYPT_SCHEME_UNENCRYPTED;
+		} else {
+			type = CRYPT_SCHEME_1;
+			min_key_version = key_get_latest_version();
+		}
+
+		key_found = min_key_version;
+	}
+
+	/** Destructor; destroys the mutex */
+	~fil_space_crypt_t()
+	{
+		mysql_mutex_destroy(&mutex);
+	}
+
+	/** Get latest key version from encryption plugin
+	@retval key_version or
+	@retval ENCRYPTION_KEY_VERSION_INVALID if used key_id
+	is not found from encryption plugin. */
+	uint key_get_latest_version(void);
+
+	/** Returns true if key was found from encryption plugin
+	and false if not. */
+	bool is_key_found() const {
+		return key_found != ENCRYPTION_KEY_VERSION_INVALID;
+	}
+
+	/** Returns true if tablespace should be encrypted */
+	bool should_encrypt() const {
+		return ((encryption == FIL_ENCRYPTION_ON) ||
+			(srv_encrypt_tables &&
+				encryption == FIL_ENCRYPTION_DEFAULT));
+	}
+
+	/** Return true unless the encryption mode is FIL_ENCRYPTION_OFF. */
+	bool is_encrypted() const {
+		return (encryption != FIL_ENCRYPTION_OFF);
+	}
+
+	/** Return true if default tablespace encryption is used. */
+	bool is_default_encryption() const {
+		return (encryption == FIL_ENCRYPTION_DEFAULT);
+	}
+
+	/** Return true if the encryption mode is FIL_ENCRYPTION_OFF. */
+	bool not_encrypted() const {
+		return (encryption == FIL_ENCRYPTION_OFF);
+	}
+
+	/** Write encryption metadata to the first page.
+	@param[in,out]	block	first page of the tablespace
+	@param[in,out]	mtr	mini-transaction */
+	void write_page0(buf_block_t* block, mtr_t* mtr);
+
+	uint min_key_version; // min key version for this space
+	fil_encryption_t encryption; // Encryption setup
+
+	mysql_mutex_t mutex;   // mutex protecting following variables
+
+	/** Return code from encryption_key_get_latest_version.
+	If ENCRYPTION_KEY_VERSION_INVALID encryption plugin
+	could not find the key and there is no need to call
+	get_latest_key_version again as keys are read only
+	at startup. */
+	uint key_found;
+
+	fil_space_rotate_state_t rotate_state;
+};
+
+/** Status info about encryption of a tablespace */
+struct fil_space_crypt_status_t {
+	ulint space;             /*!< tablespace id */
+	ulint scheme;            /*!< encryption scheme */
+	uint  min_key_version;   /*!< min key version */
+	uint  current_key_version;/*!< current key version */
+	uint  keyserver_requests;/*!< number of key requests to key server */
+	uint key_id;            /*!< current key_id */
+	bool rotating;           /*!< is key rotation ongoing */
+	bool flushing;           /*!< is flush at end of rotation ongoing */
+	ulint rotate_next_page_number; /*!< next page if key rotating */
+	ulint rotate_max_page_number;  /*!< max page if key rotating */
+};
+
+/** Statistics about encryption key rotation; every counter starts at 0 */
+struct fil_crypt_stat_t
+{
+  ulint pages_read_from_cache= 0;
+  ulint pages_read_from_disk= 0;
+  ulint pages_modified= 0;
+  ulint pages_flushed= 0;
+  ulint estimated_iops= 0;
+};
+
+/** Initialize space crypt */
+void fil_space_crypt_init();
+
+/** Clean up space crypt */
+void fil_space_crypt_cleanup();
+
+/**
+Create a fil_space_crypt_t object
+@param[in]	encrypt_mode	FIL_ENCRYPTION_DEFAULT or
+				FIL_ENCRYPTION_ON or
+				FIL_ENCRYPTION_OFF
+
+@param[in]	key_id		Encryption key id
+@return crypt object */
+fil_space_crypt_t*
+fil_space_create_crypt_data(
+	fil_encryption_t	encrypt_mode,
+	uint			key_id)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Initialize encryption parameters from a tablespace header page.
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	page		first page of the tablespace
+@return crypt data from page 0
+@retval	NULL	if not present or not valid */
+fil_space_crypt_t* fil_space_read_crypt_data(ulint zip_size, const byte* page)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**
+Free a crypt data object
+@param[in,out] crypt_data	pointer to the crypt data to be freed */
+void fil_space_destroy_crypt_data(fil_space_crypt_t **crypt_data);
+
+/** Amend encryption information from redo log.
+@param[in]	space	tablespace
+@param[in]	data	encryption metadata */
+void fil_crypt_parse(fil_space_t* space, const byte* data);
+
+/** Encrypt a buffer.
+@param[in,out]		crypt_data		Crypt data
+@param[in]		space			Tablespace id
+@param[in]		offset			Page offset
+@param[in]		src_frame		Page to encrypt
+@param[in]		zip_size		ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out]		dst_frame		Output buffer
+@param[in]		use_full_checksum	whether the full crc32 algorithm is used
+@return encrypted buffer or NULL */
+byte*
+fil_encrypt_buf(
+	fil_space_crypt_t*	crypt_data,
+	ulint			space,
+	ulint			offset,
+	const byte*		src_frame,
+	ulint			zip_size,
+	byte*			dst_frame,
+	bool			use_full_checksum)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/**
+Encrypt a page of a tablespace.
+
+@param[in]		space		Tablespace
+@param[in]		offset		Page offset
+@param[in]		src_frame	Page to encrypt
+@param[in,out]		dst_frame	Output buffer
+@return encrypted buffer or NULL */
+byte* fil_space_encrypt(
+	const fil_space_t* space,
+	ulint		offset,
+	byte*		src_frame,
+	byte*		dst_frame)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Decrypt a page.
+@param[in]	space_id		space id
+@param[in]	fsp_flags		Tablespace flags
+@param[in]	crypt_data		crypt_data
+@param[in]	tmp_frame		Temporary buffer
+@param[in]	physical_size		page size
+@param[in,out]	src_frame		Page to decrypt
+@retval DB_SUCCESS on success
+@retval DB_DECRYPTION_FAILED on error */
+dberr_t
+fil_space_decrypt(
+	uint32_t		space_id,
+	uint32_t		fsp_flags,
+	fil_space_crypt_t*	crypt_data,
+	byte*			tmp_frame,
+	ulint			physical_size,
+	byte*			src_frame);
+
+/******************************************************************
+Decrypt a page
+@param[in]	space			Tablespace
+@param[in]	tmp_frame		Temporary buffer used for decrypting
+@param[in,out]	src_frame		Page to decrypt
+@return decrypted page, or original not encrypted page if decryption is
+not needed.
+@retval nullptr on failure */
+byte*
+fil_space_decrypt(
+	const fil_space_t* space,
+	byte*		tmp_frame,
+	byte*		src_frame)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************
+Adjust thread count for key rotation
+@param[in]	new_cnt		Number of threads to be used */
+void fil_crypt_set_thread_cnt(const uint new_cnt);
+
+/*********************************************************************
+Adjust max key age
+@param[in]	val		New max key age */
+void fil_crypt_set_rotate_key_age(uint val);
+
+/*********************************************************************
+Adjust rotation iops
+@param[in]	val		New max rotation iops */
+void fil_crypt_set_rotation_iops(uint val);
+
+/*********************************************************************
+Adjust encrypt tables
+@param[in]	val		New setting for innodb-encrypt-tables */
+void fil_crypt_set_encrypt_tables(ulong val);
+
+/*********************************************************************
+Init threads for key rotation */
+void fil_crypt_threads_init();
+
+/*********************************************************************
+Clean up key rotation threads resources */
+void fil_crypt_threads_cleanup();
+
+/*********************************************************************
+Wait for crypt threads to stop accessing space
+@param[in]	space		Tablespace */
+void fil_space_crypt_close_tablespace(const fil_space_t *space);
+
+/*********************************************************************
+Get crypt status for a space (used by information_schema)
+@param[in]	space		Tablespace
+@param[out]	status		Crypt status
+(the function returns nothing; the result is passed back in status) */
+void
+fil_space_crypt_get_status(
+	const fil_space_t*			space,
+	struct fil_space_crypt_status_t*	status);
+
+/*********************************************************************
+Return crypt statistics
+@param[out]	stat		Crypt statistics */
+void fil_crypt_total_stat(fil_crypt_stat_t *stat);
+
+#include "fil0crypt.inl"
+#endif /* !UNIV_INNOCHECKSUM */
+
+/**
+Verify that the post-encryption checksum matches the calculated checksum.
+This function should be called only if tablespace contains crypt_data
+metadata (this is strong indication that tablespace is encrypted).
+Function also verifies that traditional checksum does not match
+calculated checksum as if it does page could be valid unencrypted,
+encrypted, or corrupted.
+
+@param[in]	page		page frame (const here; NOTE(review): confirm it is not modified)
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@return true if page is encrypted AND OK, false otherwise */
+bool fil_space_verify_crypt_checksum(const byte* page, ulint zip_size)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Tell whether to add the tablespace to the rotation list, which is so if
+innodb_encrypt_rotate_key_age is 0 or encryption plugin does
+not do key version rotation
+@return whether the tablespace should be added to rotation list */
+bool fil_crypt_must_default_encrypt();
+
+#endif /* fil0crypt_h */
diff --git a/storage/innobase/include/fil0crypt.inl b/storage/innobase/include/fil0crypt.inl
new file mode 100644
index 00000000..cc59b394
--- /dev/null
+++ b/storage/innobase/include/fil0crypt.inl
@@ -0,0 +1,81 @@
+/*****************************************************************************
+
+Copyright (c) 2015, 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0crypt.inl
+The low-level file system encryption support functions
+
+Created 04/01/2015 Jan Lindström
+*******************************************************/
+
+/*******************************************************************//**
+Find out whether the page is page encrypted, judging by its key version field
+@return	true if the 4-byte field is nonzero, false if it is zero */
+UNIV_INLINE
+bool
+fil_page_is_encrypted(
+/*==================*/
+	const byte *buf)	/*!< in: page */
+{
+	return(0 != mach_read_from_4(&buf[FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION]));
+}
+
+/*******************************************************************//**
+Get current encryption mode from crypt_data.
+@return string representation */
+UNIV_INLINE
+const char *
+fil_crypt_get_mode(
+/*===============*/
+	const fil_space_crypt_t* crypt_data)	/*!< in: crypt data, not NULL */
+{
+	ut_ad(crypt_data != NULL);
+	switch (crypt_data->encryption) {
+	case FIL_ENCRYPTION_DEFAULT:
+		return("Default tablespace encryption mode");
+	case FIL_ENCRYPTION_ON:
+		return("Tablespace encrypted");
+	case FIL_ENCRYPTION_OFF:
+		return("Tablespace not encrypted");
+	}
+	ut_error;	/* none of the modes handled above */
+	return ("NULL");
+}
+
+/*******************************************************************//**
+Get current encryption type (scheme) from crypt_data.
+@return string representation */
+UNIV_INLINE
+const char *
+fil_crypt_get_type(
+	const fil_space_crypt_t* crypt_data)
+{
+	ut_ad(crypt_data != NULL);
+	switch (crypt_data->type) {
+	case CRYPT_SCHEME_1:
+		return("scheme encrypted");
+	case CRYPT_SCHEME_UNENCRYPTED:
+		return("scheme unencrypted");
+	default:
+		/* Any other scheme value is reported below. */
+		break;
+	}
+
+	ut_error;
+	return ("NULL");
+}
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
new file mode 100644
index 00000000..6f58e3c1
--- /dev/null
+++ b/storage/innobase/include/fil0fil.h
@@ -0,0 +1,1823 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0fil.h
+The low-level file system
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "fsp0types.h"
+#include "mach0data.h"
+#include "assume_aligned.h"
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "srw_lock.h"
+#include "buf0dblwr.h"
+#include "hash0hash.h"
+#include "log0recv.h"
+#include "dict0types.h"
+#include "ilist.h"
+#include <set>
+#include <mutex>
+
+struct unflushed_spaces_tag_t;
+struct default_encrypt_tag_t;
+struct space_list_tag_t;
+struct named_spaces_tag_t;
+
+using space_list_t= ilist<fil_space_t, space_list_tag_t>;
+
+// Forward declaration
+extern my_bool srv_use_doublewrite_buf;
+
+/** Possible values of innodb_flush_method */
+enum srv_flush_t
+{
+  /** fsync, the default */
+  SRV_FSYNC= 0,
+  /** open log files in O_DSYNC mode */
+  SRV_O_DSYNC,
+  /** do not call os_file_flush() when writing data files, but do flush
+  after writing to log files */
+  SRV_LITTLESYNC,
+  /** do not flush after writing */
+  SRV_NOSYNC,
+  /** invoke os_file_set_nocache() on data files. This implies using
+  unbuffered I/O but still fdatasync(), because some filesystems might
+  not flush meta-data on write completion */
+  SRV_O_DIRECT,
+  /** Like O_DIRECT, but skip fdatasync(), assuming that the data is
+  durable on write completion */
+  SRV_O_DIRECT_NO_FSYNC
+#ifdef _WIN32
+  /** Traditional Windows approach to open all files without caching,
+  and do FlushFileBuffers() */
+  ,SRV_ALL_O_DIRECT_FSYNC
+#endif
+};
+
+/** innodb_flush_method */
+extern ulong srv_file_flush_method;
+
+/** The first tablespace ID that is used for undo tablespaces. */
+extern uint32_t srv_undo_space_id_start;
+/** The number of UNDO tablespaces that are open and ready to use. */
+extern uint32_t srv_undo_tablespaces_open;
+
+/** Determine whether a tablespace identifier belongs to an undo tablespace.
+@param[in]	space_id	tablespace identifier to check
+@return whether space_id is in the range of open undo tablespace IDs */
+inline bool srv_is_undo_tablespace(uint32_t space_id)
+{
+  if (!srv_undo_space_id_start || space_id < srv_undo_space_id_start)
+    return false;
+  return space_id < srv_undo_space_id_start + srv_undo_tablespaces_open;
+}
+
+class page_id_t;
+
+/** Structure containing encryption specification */
+struct fil_space_crypt_t;
+
+/** File types; the type of fil_space_t::purpose */
+enum fil_type_t {
+	/** temporary tablespace (temporary undo log or tables) */
+	FIL_TYPE_TEMPORARY,
+	/** a tablespace that is being imported (no logging until finished) */
+	FIL_TYPE_IMPORT,
+	/** persistent tablespace (for system, undo log or tables) */
+	FIL_TYPE_TABLESPACE,
+};
+
+struct fil_node_t;
+
+/** Inclusive range of integers: first <= value <= last */
+struct range_t
+{
+  uint32_t first; /* smallest value in the range */
+  uint32_t last; /* largest value in the range */
+};
+
+/** Order ranges by their first value only (range_t::last is ignored) */
+struct range_compare
+{
+  bool operator() (const range_t lhs, const range_t rhs) const
+  {
+    return rhs.first > lhs.first;
+  }
+};
+
+using range_set_t= std::set<range_t, range_compare>;
+/** Set of inclusive integer ranges [first,last], ordered by first value */
+class range_set
+{
+private:
+  range_set_t ranges;
+
+  /** @return the range containing value, or ranges.end() if none does.
+  Unless a range starts exactly at value, only the predecessor can match. */
+  range_set_t::iterator find(uint32_t value) const
+  {
+    auto r_offset= ranges.lower_bound({value, value});
+    const auto r_end= ranges.end();
+    if (r_offset != r_end && r_offset->first == value)
+      return r_offset;
+    if (r_offset == ranges.begin())
+      return r_end; /* empty set, or every range starts above value */
+    --r_offset;
+    return r_offset->last >= value ? r_offset : r_end;
+  }
+public:
+  /** Merge a range into the preceding range if the two are adjacent.
+  @param[in] range      range that may start right after prev_range ends
+  @param[in] prev_range range preceding range */
+  void merge_range(range_set_t::iterator range,
+		   range_set_t::iterator prev_range)
+  {
+    if (range->first != prev_range->last + 1)
+      return;
+
+    /* Replace both by one range that spans from prev_range to range */
+    range_t new_range {prev_range->first, range->last};
+    ranges.erase(prev_range);
+    ranges.erase(range);
+    ranges.emplace(new_range);
+  }
+
+  /** Replace a range with the two ranges on either side of a value.
+  @param[in] range	range to be split; first < value < last is expected
+  @param[in] value	value to be removed from range */
+  void split_range(range_set_t::iterator range, uint32_t value)
+  {
+    range_t split1{range->first, value - 1};
+    range_t split2{value + 1, range->last};
+
+    /* Remove the existing element */
+    ranges.erase(range);
+
+    /* Insert the two elements */
+    ranges.emplace(split1);
+    ranges.emplace(split2);
+  }
+
+  /** Remove a value from a range by shrinking, splitting or erasing it.
+  @param[in,out] range  range to be changed
+  @param[in]	 value	value to be removed */
+  void remove_within_range(range_set_t::iterator range, uint32_t value)
+  {
+    range_t new_range{range->first, range->last};
+    if (value == range->first)
+    {
+      if (range->first == range->last)
+      {
+        ranges.erase(range); /* the range held only this value */
+        return;
+      }
+      else
+        new_range.first++;
+    }
+    else if (value == range->last)
+      new_range.last--;
+    else if (range->first < value && range->last > value)
+      return split_range(range, value);
+
+    ranges.erase(range); /* if value was outside, the same range returns */
+    ranges.emplace(new_range);
+  }
+
+  /** Remove the value from the ranges.
+  @param[in]	value	Value to be removed. */
+  void remove_value(uint32_t value)
+  {
+    if (empty())
+      return;
+    range_t new_range {value, value};
+    range_set_t::iterator range= ranges.lower_bound(new_range);
+    if (range == ranges.end())
+      return remove_within_range(std::prev(range), value);
+
+    if (range->first > value && range != ranges.begin())
+      /* Only the previous range can contain the value */
+      return remove_within_range(std::prev(range), value);
+    return remove_within_range(range, value);
+  }
+  /** Add the value to an existing range that contains or adjoins it.
+  @param[in]	range	range to be modified
+  @param[in]	value	value to be added
+  @return the range that now contains value, or ranges.end() if not adjacent */
+  range_set_t::iterator add_within_range(range_set_t::iterator range,
+                                         uint32_t value)
+  {
+    if (range->first <= value && range->last >= value)
+      return range;
+    range_t new_range{range->first, range->last};
+    if (range->last + 1 == value)
+      new_range.last++;
+    else if (range->first - 1 == value)
+      new_range.first--;
+    else return ranges.end();
+    ranges.erase(range);
+    return ranges.emplace(new_range).first;
+  }
+  /** Add the range in the ranges set. NOTE: if new_range.first is inside
+  or adjacent to an existing range, only that one value is added to it.
+  @param[in]	new_range	range to be added */
+  void add_range(range_t new_range)
+  {
+    auto r_offset= ranges.lower_bound(new_range);
+    auto r_begin= ranges.begin();
+    auto r_end= ranges.end();
+    if (!ranges.size())
+    {
+new_range:
+      ranges.emplace(new_range);
+      return;
+    }
+    if (r_offset == r_end)
+    {
+      /* last range */
+      if (add_within_range(std::prev(r_offset), new_range.first) == r_end)
+        goto new_range;
+    }
+    else if (r_offset == r_begin)
+    {
+      /* First range */
+      if (add_within_range(r_offset, new_range.first) == r_end)
+        goto new_range;
+    }
+    else if (r_offset->first - 1 == new_range.first)
+    {
+      /* Change starting of the existing range */
+      auto r_value= add_within_range(r_offset, new_range.first);
+      if (r_value != ranges.begin())
+        merge_range(r_value, std::prev(r_value));
+    }
+    else
+    {
+      /* previous range last_value alone */
+      if (add_within_range(std::prev(r_offset), new_range.first) == r_end)
+        goto new_range;
+    }
+  }
+
+  /** Add the value in the ranges
+  @param[in] value  value to be added */
+  void add_value(uint32_t value)
+  {
+    range_t new_range{value, value};
+    add_range(new_range);
+  }
+
+  /** Remove the value if some range contains it.
+  @return whether the value was found and removed */
+  bool remove_if_exists(uint32_t value)
+  {
+    auto r_offset= find(value);
+    if (r_offset == ranges.end())
+      return false;
+    remove_within_range(r_offset, value);
+    return true;
+  }
+
+  /** Check membership.
+  @return whether some range contains the value */
+  bool contains(uint32_t value) const
+  { return find(value) != ranges.end(); }
+
+  ulint size() const { return ranges.size(); } /* number of ranges */
+  void clear() { ranges.clear(); }
+  bool empty() const { return ranges.empty(); }
+  typename range_set_t::iterator begin() { return ranges.begin(); }
+  typename range_set_t::iterator end() { return ranges.end(); }
+};
+#endif
+
+/** Tablespace or log data space */
+#ifndef UNIV_INNOCHECKSUM
+struct fil_io_t /* the return type of fil_space_t::io() */
+{
+  /** error code */
+  dberr_t err;
+  /** file; node->space->release() must follow IORequestRead call */
+  fil_node_t *node;
+};
+
+/** Tablespace encryption mode; see also fil_crypt_get_mode() */
+enum fil_encryption_t
+{
+  /** Encrypted if innodb_encrypt_tables=ON (srv_encrypt_tables) */
+  FIL_ENCRYPTION_DEFAULT,
+  /** Encrypted */
+  FIL_ENCRYPTION_ON,
+  /** Not encrypted */
+  FIL_ENCRYPTION_OFF
+};
+
+struct fil_space_t final : ilist_node<unflushed_spaces_tag_t>,
+                           ilist_node<default_encrypt_tag_t>,
+                           ilist_node<space_list_tag_t>,
+                           ilist_node<named_spaces_tag_t>
+#else
+struct fil_space_t final
+#endif
+{
+#ifndef UNIV_INNOCHECKSUM
+  friend fil_node_t;
+  ~fil_space_t() /* expects latch_owner and latch_count to be zero */
+  {
+    ut_ad(!latch_owner);
+    ut_ad(!latch_count);
+    latch.destroy();
+  }
+
+  /** fil_system.spaces chain node */
+  fil_space_t *hash;
+  /** LSN of the most recent fil_names_write_if_was_clean().
+  Reset to 0 by fil_names_clear(). Protected by exclusive log_sys.latch.
+  If and only if max_lsn is nonzero, this is in fil_system.named_spaces. */
+  lsn_t max_lsn;
+  /** tablespace identifier */
+  uint32_t id;
+	/** whether undo tablespace truncation is in progress */
+	bool		is_being_truncated;
+	fil_type_t	purpose;/*!< purpose */
+	UT_LIST_BASE_NODE_T(fil_node_t) chain;
+				/*!< base node for the file chain */
+	uint32_t	size;	/*!< tablespace file size in pages;
+				0 if not known yet */
+	uint32_t	size_in_header;
+				/* FSP_SIZE in the tablespace header;
+				0 if not known yet */
+	uint32_t	free_len;
+				/*!< length of the FSP_FREE list */
+	uint32_t	free_limit;
+				/*!< contents of FSP_FREE_LIMIT */
+	uint32_t	recv_size;
+				/*!< recovered tablespace size in pages;
+				0 if no size change was read from the redo log,
+				or if the size change was implemented */
+	uint32_t	n_reserved_extents;
+				/*!< number of reserved free extents for
+				ongoing operations like B-tree page split */
+private:
+#ifdef UNIV_DEBUG
+  fil_space_t *next_in_space_list();
+  fil_space_t *prev_in_space_list();
+
+  fil_space_t *next_in_unflushed_spaces();
+  fil_space_t *prev_in_unflushed_spaces();
+#endif
+
+  /** the committed size of the tablespace in pages */
+  Atomic_relaxed<uint32_t> committed_size;
+  /** Number of pending operations on the file.
+  The tablespace cannot be freed while (n_pending & PENDING) != 0. */
+  std::atomic<uint32_t> n_pending;
+  /** Flag in n_pending that indicates that the tablespace is about to be
+  deleted, and no further operations should be performed */
+  static constexpr uint32_t STOPPING_READS= 1U << 31;
+  /** Flag in n_pending that indicates that the tablespace is being
+  deleted, and no further operations should be performed */
+  static constexpr uint32_t STOPPING_WRITES= 1U << 30;
+  /** Flags in n_pending that indicate that the tablespace is being
+  deleted, and no further operations should be performed */
+  static constexpr uint32_t STOPPING= STOPPING_READS | STOPPING_WRITES;
+  /** Flag in n_pending that indicates that the tablespace is a candidate
+  for being closed, and fil_node_t::is_open() can only be trusted after
+  acquiring fil_system.mutex and resetting the flag */
+  static constexpr uint32_t CLOSING= 1U << 29;
+  /** Flag in n_pending that indicates that the tablespace needs fsync().
+  This must be the least significant flag bit; @see release_flush() */
+  static constexpr uint32_t NEEDS_FSYNC= 1U << 28;
+  /** The reference count */
+  static constexpr uint32_t PENDING= ~(STOPPING | CLOSING | NEEDS_FSYNC);
+  /** latch protecting all page allocation bitmap pages */
+  srw_lock latch;
+  pthread_t latch_owner;
+  ut_d(Atomic_relaxed<uint32_t> latch_count;)
+public:
+  /** MariaDB encryption data */
+  fil_space_crypt_t *crypt_data;
+
+  /** Whether needs_flush(), or this is in fil_system.unflushed_spaces */
+  bool is_in_unflushed_spaces;
+
+  /** Whether this is in fil_system.default_encrypt_tables (needs key rotation) */
+  bool is_in_default_encrypt;
+
+private:
+  /** Whether any corruption of this tablespace has been reported */
+  mutable std::atomic_flag is_corrupted;
+
+public:
+  /** mutex to protect freed_ranges and last_freed_lsn */
+  std::mutex freed_range_mutex;
+private:
+  /** Ranges of freed page numbers; protected by freed_range_mutex */
+  range_set freed_ranges;
+
+  /** LSN of freeing last page; protected by freed_range_mutex */
+  lsn_t last_freed_lsn;
+
+public:
+  /** @return whether doublewrite buffering is needed */
+  inline bool use_doublewrite() const;
+
+  /** @return whether a page has been freed */
+  inline bool is_freed(uint32_t page);
+
+  /** Apply freed_ranges to the file.
+  @param writable whether the file is writable
+  @return number of pages written or hole-punched */
+  uint32_t flush_freed(bool writable);
+
+	/** Append a file to the chain of files of a space.
+	@param[in]	name		file name of a file that is not open
+	@param[in]	handle		file handle, or OS_FILE_CLOSED
+	@param[in]	size		file size in entire database pages
+	@param[in]	is_raw		whether this is a raw device
+	@param[in]	atomic_write	true if atomic write could be enabled
+	@param[in]	max_pages	maximum number of pages in file,
+	or UINT32_MAX for unlimited
+	@return file object */
+	fil_node_t* add(const char* name, pfs_os_file_t handle,
+			uint32_t size, bool is_raw, bool atomic_write,
+			uint32_t max_pages = UINT32_MAX);
+#ifdef UNIV_DEBUG
+	/** Assert that the mini-transaction is compatible with
+	updating an allocation bitmap page.
+	@param[in]	mtr	mini-transaction */
+	void modify_check(const mtr_t& mtr) const;
+#endif /* UNIV_DEBUG */
+
+	/** Reserve free extents, provided that enough of them are available.
+	@param[in]	n_free_now	current number of free extents
+	@param[in]	n_to_reserve	number of extents to reserve
+	@return	whether the reservation fit in n_free_now and was recorded */
+	bool reserve_free_extents(uint32_t n_free_now, uint32_t n_to_reserve)
+	{
+		const uint32_t n_total = n_reserved_extents + n_to_reserve;
+		if (n_total <= n_free_now) {
+			n_reserved_extents = n_total;
+			return true;
+		}
+		return false;
+	}
+
+	/** Give back extents that were counted in n_reserved_extents.
+	@param[in]	n_reserved	number of extents to release (may be 0) */
+	void release_free_extents(uint32_t n_reserved)
+	{
+		if (n_reserved == 0) return;
+		ut_a(n_reserved_extents >= n_reserved);
+		n_reserved_extents = n_reserved_extents - n_reserved;
+	}
+
+  /** Rename a file.
+  @param[in]	path	tablespace file name after renaming
+  @param[in]	log	whether to write redo log
+  @param[in]	replace	whether to ignore the existence of path
+  @return	error code
+  @retval	DB_SUCCESS	on success */
+  dberr_t rename(const char *path, bool log, bool replace= false)
+    MY_ATTRIBUTE((nonnull));
+
+  /** Note that the tablespace has been imported.
+  Initially, purpose=FIL_TYPE_IMPORT so that no redo log is
+  written while the space ID is being updated in each page. */
+  inline void set_imported();
+
+  /** Report the tablespace as corrupted */
+  ATTRIBUTE_COLD void set_corrupted() const;
+
+  /** @return whether the storage device is rotational (HDD, not SSD) */
+  inline bool is_rotational() const;
+
+  /** Open each file. Never invoked on .ibd files.
+  @param create_new_db    whether to skip the call to fil_node_t::read_page0()
+  @return whether all files were opened */
+  bool open(bool create_new_db);
+  /** Close each file. Only invoked on fil_system.temp_space. */
+  void close();
+
+  /** Note that operations on the tablespace must stop. */
+  inline void set_stopping();
+
+  /** Note that operations on the tablespace can resume after truncation */
+  inline void clear_stopping();
+
+  /** Drop the tablespace and wait for any pending operations to cease
+  @param id               tablespace identifier
+  @param detached_handle  pointer to file to be closed later, or nullptr
+  @return tablespace to invoke fil_space_free() on
+  @retval nullptr if no tablespace was found, or it was deleted by
+  another concurrent thread */
+  static fil_space_t *drop(uint32_t id, pfs_os_file_t *detached_handle);
+
+private:
+  MY_ATTRIBUTE((warn_unused_result))
+  /** Try to acquire a tablespace reference (increment referenced()).
+  @param avoid   when these flags are set, nothing will be acquired
+  @return the old n_pending (count and flags); acquired iff !(it & avoid) */
+  uint32_t acquire_low(uint32_t avoid= STOPPING)
+  {
+    uint32_t n= 0; /* first guess; a failed exchange loads the actual value */
+    while (!n_pending.compare_exchange_strong(n, n + 1,
+                                              std::memory_order_acquire,
+                                              std::memory_order_relaxed) &&
+           !(n & avoid)); /* retry unless a flag in avoid has been set */
+    return n;
+  }
+public:
+  MY_ATTRIBUTE((warn_unused_result))
+  /** Acquire a tablespace reference.
+  @return whether a tablespace reference was successfully acquired */
+  inline bool acquire_if_not_stopped();
+
+  MY_ATTRIBUTE((warn_unused_result))
+  /** Acquire a tablespace reference for I/O, unless a flag in avoid is set.
+  @return whether the file is usable */
+  bool acquire(uint32_t avoid= STOPPING | CLOSING)
+  {
+    const uint32_t blocked= avoid & acquire_low(avoid);
+    if (UNIV_LIKELY(!blocked)) return true;
+    return blocked == CLOSING && acquire_and_prepare();
+  }
+
+  /** Acquire a tablespace reference for writing.
+  Like acquire(), but only STOPPING_WRITES and CLOSING are avoided.
+  @return whether the file is writable */
+  bool acquire_for_write() { return acquire(STOPPING_WRITES | CLOSING); }
+
+  /** Acquire another tablespace reference for I/O. */
+  inline void reacquire();
+
+  /** Drop one reference on this tablespace by decrementing n_pending.
+  @return whether the reference count was 1 before the decrement */
+  bool release()
+  {
+    const uint32_t n= n_pending.fetch_sub(1, std::memory_order_release);
+    ut_ad(n & PENDING);
+    return 1 == (n & PENDING);
+  }
+
+  /** Atomically clear the NEEDS_FSYNC flag (bit 28) in n_pending */
+  void clear_flush()
+  {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    static_assert(NEEDS_FSYNC == 1U << 28, "compatibility"); /* $28 below */
+    __asm__ __volatile__("lock btrl $28, %0" : "+m" (n_pending));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    static_assert(NEEDS_FSYNC == 1U << 28, "compatibility"); /* 28 below */
+    _interlockedbittestandreset(reinterpret_cast<volatile long*>
+                                (&n_pending), 28);
+#else
+    n_pending.fetch_and(~NEEDS_FSYNC, std::memory_order_release);
+#endif
+  }
+
+private:
+  /** Atomically clear the CLOSING flag (bit 29) in n_pending */
+  void clear_closing()
+  {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    static_assert(CLOSING == 1U << 29, "compatibility"); /* $29 below */
+    __asm__ __volatile__("lock btrl $29, %0" : "+m" (n_pending));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    static_assert(CLOSING == 1U << 29, "compatibility"); /* 29 below */
+    _interlockedbittestandreset(reinterpret_cast<volatile long*>
+                                (&n_pending), 29);
+#else
+    n_pending.fetch_and(~CLOSING, std::memory_order_relaxed);
+#endif
+  }
+
+  /** @return pending operations (and flags), using an acquire load */
+  uint32_t pending()const { return n_pending.load(std::memory_order_acquire); }
+public:
+  /** @return whether close() of the file handle has been requested */
+  bool is_closing() const { return pending() & CLOSING; }
+  /** @return whether STOPPING_READS or STOPPING_WRITES is set */
+  bool is_stopping() const { return pending() & STOPPING; }
+  /** @return whether STOPPING_WRITES is set */
+  bool is_stopping_writes() const { return pending() & STOPPING_WRITES; }
+  /** @return whether CLOSING is set and there are no references */
+  bool is_ready_to_close() const
+  { return (pending() & (PENDING | CLOSING)) == CLOSING; }
+  /** @return whether fsync() or similar is needed */
+  bool needs_flush() const { return pending() & NEEDS_FSYNC; }
+  /** @return whether fsync() or similar is needed, and
+  STOPPING_WRITES is not set */
+  bool needs_flush_not_stopping() const
+  { return (pending() & (NEEDS_FSYNC | STOPPING_WRITES)) == NEEDS_FSYNC; }
+  /** @return the reference count (n_pending without the flag bits) */
+  uint32_t referenced() const { return pending() & PENDING; }
+private:
+  MY_ATTRIBUTE((warn_unused_result))
+  /** Prepare to close the file handle by setting the CLOSING flag.
+  @return the value of n_pending (count and flags) before CLOSING was set */
+  uint32_t set_closing()
+  {
+    return n_pending.fetch_or(CLOSING, std::memory_order_acquire);
+  }
+
+public:
+  /** Try to close a file to adhere to the innodb_open_files limit.
+  @param print_info   whether to diagnose why a file cannot be closed
+  @return whether a file was closed */
+  static bool try_to_close(bool print_info);
+
+  /** Close all tablespace files at shutdown */
+  static void close_all();
+
+  /** Update last_freed_lsn; presumably under freed_range_mutex (TODO confirm) */
+  void update_last_freed_lsn(lsn_t lsn) { last_freed_lsn= lsn; }
+
+  /** Try to set the NEEDS_FSYNC flag to note that the file will need fsync().
+  @return whether this needs to be added to fil_system.unflushed_spaces */
+  bool set_needs_flush()
+  {
+    uint32_t n= 1; /* initial guess: one reference and no flags */
+    while (!n_pending.compare_exchange_strong(n, n | NEEDS_FSYNC,
+                                              std::memory_order_acquire,
+                                              std::memory_order_relaxed))
+    {
+      ut_ad(n & PENDING); /* at least one reference is expected to exist */
+      if (n & (NEEDS_FSYNC | STOPPING_WRITES))
+        return false; /* already flagged, or STOPPING_WRITES is set */
+    }
+    /* this call changed NEEDS_FSYNC from clear to set */
+    return true;
+  }
+
+  /** Clear all freed ranges for undo tablespace when InnoDB
+  encounters TRIM redo log record; freed_range_mutex is not acquired here */
+  void clear_freed_ranges() { freed_ranges.clear(); }
+#endif /* !UNIV_INNOCHECKSUM */
+  /** FSP_SPACE_FLAGS and FSP_FLAGS_MEM_ flags;
+  check fsp0types.h to more info about flags. */
+  uint32_t flags;
+
+  /** Determine if full_crc32 is used for a data file
+  @param[in]	flags	tablespace flags (FSP_SPACE_FLAGS)
+  @return whether FSP_FLAGS_FCRC32_MASK_MARKER is set in flags */
+  static bool full_crc32(uint32_t flags)
+  { return (flags & FSP_FLAGS_FCRC32_MASK_MARKER) != 0; }
+  /** @return whether this tablespace uses the full_crc32 format */
+  bool full_crc32() const { return full_crc32(this->flags); }
+  /** Determine if full_crc32 is used along with PAGE_COMPRESSED
+  @return whether flags are full_crc32 with a nonzero compression algorithm */
+  static bool is_full_crc32_compressed(uint32_t flags)
+  {
+    if (!full_crc32(flags)) return false;
+    const auto algo= FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags);
+    DBUG_ASSERT(algo <= PAGE_ALGORITHM_LAST);
+    return 0 != algo;
+  }
+  /** Determine the logical page size.
+  @param flags	tablespace flags (FSP_SPACE_FLAGS)
+  @return the logical page size in bytes
+  @retval 0 if the flags are invalid */
+  static unsigned logical_size(uint32_t flags)
+  {
+    const bool fcrc32= full_crc32(flags);
+    const auto ssize= fcrc32
+      ? FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags)
+      : FSP_FLAGS_GET_PAGE_SSIZE(flags);
+    /* 16384 is encoded as 5 in full_crc32 format and as 0 otherwise */
+    if (ssize == (fcrc32 ? 5U : 0U))
+      return 16384;
+    /* 3, 4, 6 and 7 stand for 512 << ssize in both formats */
+    if (ssize == 3 || ssize == 4 || ssize == 6 || ssize == 7)
+      return 512U << ssize;
+    return 0;
+  }
+  /** Determine the ROW_FORMAT=COMPRESSED page size.
+  @param flags	tablespace flags (FSP_SPACE_FLAGS)
+  @return the ROW_FORMAT=COMPRESSED page size
+  @retval 0	if ROW_FORMAT=COMPRESSED is not used, or flags are full_crc32 */
+  static unsigned zip_size(uint32_t flags)
+  {
+    const uint32_t shift= full_crc32(flags) ? 0 : FSP_FLAGS_GET_ZIP_SSIZE(flags);
+    if (!shift)
+      return 0;
+    return (UNIV_ZIP_SIZE_MIN >> 1) << shift;
+  }
+  /** Determine the physical page size.
+  @param flags	tablespace flags (FSP_SPACE_FLAGS)
+  @return logical_size() for full_crc32 flags; else the
+  ROW_FORMAT=COMPRESSED page size if one is set, or srv_page_size */
+  static unsigned physical_size(uint32_t flags)
+  {
+    if (full_crc32(flags))
+      return logical_size(flags);
+    /* a nonzero ZIP_SSIZE means ROW_FORMAT=COMPRESSED */
+    if (const uint32_t zip_ssize= FSP_FLAGS_GET_ZIP_SSIZE(flags))
+      return (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize;
+    return unsigned(srv_page_size);
+  }
+
+  /** @return the ROW_FORMAT=COMPRESSED page size according to this->flags
+  @retval 0  if ROW_FORMAT=COMPRESSED is not used */
+  unsigned zip_size() const { return zip_size(flags); }
+  /** @return the physical page size according to this->flags */
+  unsigned physical_size() const { return physical_size(flags); }
+
+  /** Check whether PAGE_COMPRESSED is enabled (in either flags format).
+  @param[in]	flags	tablespace flags */
+  static bool is_compressed(uint32_t flags)
+  {
+    if (is_full_crc32_compressed(flags)) return true;
+    return FSP_FLAGS_HAS_PAGE_COMPRESSION(flags);
+  }
+  /** @return whether PAGE_COMPRESSED is enabled for this tablespace */
+  bool is_compressed() const { return is_compressed(this->flags); }
+
+  /** Get the compression algorithm for full crc32 format.
+  @param flags contents of FSP_SPACE_FLAGS
+  @return PAGE_COMPRESSED algorithm of full_crc32 tablespace
+  @retval 0 if not PAGE_COMPRESSED or not full_crc32 */
+  static unsigned get_compression_algo(uint32_t flags)
+  {
+    if (!full_crc32(flags))
+      return 0;
+    return FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags);
+  }
+  /** @return the page_compressed algorithm of this tablespace
+  @retval 0 if not page_compressed or not full_crc32 */
+  unsigned get_compression_algo() const { return get_compression_algo(flags); }
+  /** Determine if the page_compressed page contains an extra byte
+  for exact compressed stream length
+  @param flags   contents of FSP_SPACE_FLAGS (must be in full_crc32 format)
+  @return whether the extra byte is needed, which is the case for the
+  LZ4, LZO and Snappy algorithms */
+  static bool full_crc32_page_compressed_len(uint32_t flags)
+  {
+    DBUG_ASSERT(full_crc32(flags));
+    /* Every other algorithm code, as well as 0 (not compressed),
+    means that there is no extra length byte. */
+    const unsigned algo= get_compression_algo(flags);
+    return algo == PAGE_LZ4_ALGORITHM ||
+      algo == PAGE_LZO_ALGORITHM ||
+      algo == PAGE_SNAPPY_ALGORITHM;
+  }
+
+  /** Compare full_crc32 flags with expected flags of either format.
+  Only the encoded page size takes part in the comparison.
+  @param flags    contents of FSP_SPACE_FLAGS (in full_crc32 format)
+  @param expected expected flags
+  @return whether both encode the same page size */
+  static bool is_flags_full_crc32_equal(uint32_t flags, uint32_t expected)
+  {
+    ut_ad(full_crc32(flags));
+    const uint32_t page_ssize= FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags);
+    if (full_crc32(expected))
+      /* The data file may have been created with a
+      different innodb_compression_algorithm. But
+      we only support one innodb_page_size for all files. */
+      return page_ssize == FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(expected);
+
+    /* expected is in the old format, where the default
+    innodb_page_size=16k is encoded as 0 instead of 5 */
+    const uint32_t old_ssize= FSP_FLAGS_GET_PAGE_SSIZE(expected);
+    return page_ssize == (old_ssize ? old_ssize : 5);
+  }
+
+  /** Whether old-format tablespace flags match expected full_crc32 flags.
+  @param flags    contents of FSP_SPACE_FLAGS (not in full_crc32 format)
+  @param expected expected flags
+  @return whether expected is in full_crc32 format and encodes the
+  same page size as flags */
+  static bool is_flags_non_full_crc32_equal(uint32_t flags, uint32_t expected)
+  {
+    ut_ad(!full_crc32(flags));
+    if (!full_crc32(expected))
+      return false;
+
+    /* In the old format, innodb_page_size=16k is encoded as 0;
+    in full_crc32 format it is encoded as 5. */
+    const uint32_t old_ssize= FSP_FLAGS_GET_PAGE_SSIZE(flags);
+    const uint32_t wanted= old_ssize ? old_ssize : 5;
+    return FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(expected) == wanted;
+  }
+
+  /** @return whether flags and expected are identical (ignoring the bit at
+  FSP_FLAGS_POS_RESERVED), or the format-aware page size comparison holds */
+  static bool is_flags_equal(uint32_t flags, uint32_t expected)
+  {
+    const uint32_t diff= (flags ^ expected) & ~(1U << FSP_FLAGS_POS_RESERVED);
+    if (!diff) return true;
+    if (full_crc32(flags)) return is_flags_full_crc32_equal(flags, expected);
+    return is_flags_non_full_crc32_equal(flags, expected);
+  }
+
+  /** Validate the tablespace flags for full crc32 format.
+  @param flags contents of FSP_SPACE_FLAGS (with the full_crc32 marker set)
+  @return whether the page size and compression algorithm codes are valid */
+  static bool is_fcrc32_valid_flags(uint32_t flags)
+  {
+    ut_ad(flags & FSP_FLAGS_FCRC32_MASK_MARKER);
+    /* physical_size() is in bytes; it is 0 for an invalid page size code */
+    if (!physical_size(flags))
+      return false;
+    flags >>= FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO;
+    return flags <= PAGE_ALGORITHM_LAST;
+  }
+  /** Validate the tablespace flags.
+  @param flags	contents of FSP_SPACE_FLAGS
+  @param is_ibd	whether this is an .ibd file (not system tablespace)
+  @return whether the flags are correct */
+  static bool is_valid_flags(uint32_t flags, bool is_ibd)
+  {
+    DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return false;);
+    if (full_crc32(flags))
+      return is_fcrc32_valid_flags(flags);
+
+    if (flags == 0)
+      return true; /* all-zero flags are accepted */
+    if (~FSP_FLAGS_MASK & flags)
+      return false; /* some bit outside FSP_FLAGS_MASK is set */
+
+    if (FSP_FLAGS_MASK_ATOMIC_BLOBS ==
+        (flags & (FSP_FLAGS_MASK_POST_ANTELOPE | FSP_FLAGS_MASK_ATOMIC_BLOBS)))
+      /* If the "atomic blobs" flag (indicating
+      ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED) is set, then the
+      ROW_FORMAT!=REDUNDANT flag must also be set. */
+      return false;
+
+    /* Bits 10..14 should be 0b0000d where d is the DATA_DIR flag
+    of MySQL 5.6 and MariaDB 10.0, which we ignore.
+    In the buggy FSP_SPACE_FLAGS written by MariaDB 10.1.0 to 10.1.20,
+    bits 10..14 would be nonzero 0bsssaa where sss is
+    nonzero PAGE_SSIZE (3, 4, 6, or 7)
+    and aa is ATOMIC_WRITES (not 0b11). */
+    if (FSP_FLAGS_GET_RESERVED(flags) & ~1U)
+      return false;
+
+    const uint32_t ssize= FSP_FLAGS_GET_PAGE_SSIZE(flags);
+    if (ssize == 1 || ssize == 2 || ssize == 5 || ssize & 8)
+      /* the page_size is not between 4k and 64k;
+      16k should be encoded as 0, not 5 */
+      return false;
+
+    const uint32_t zssize= FSP_FLAGS_GET_ZIP_SSIZE(flags);
+    if (zssize == 0)
+      /* not ROW_FORMAT=COMPRESSED */;
+    else if (zssize > (ssize ? ssize : 5))
+      /* Invalid KEY_BLOCK_SIZE */
+      return false;
+    else if (~flags &
+             (FSP_FLAGS_MASK_POST_ANTELOPE | FSP_FLAGS_MASK_ATOMIC_BLOBS))
+     /* both these flags must be set for ROW_FORMAT=COMPRESSED */
+     return false;
+
+    /* The flags do look valid. But, avoid misinterpreting
+    buggy MariaDB 10.1 format flags for
+    PAGE_COMPRESSED=1 PAGE_COMPRESSION_LEVEL={0,2,3}
+    as valid-looking PAGE_SSIZE if this is known to be
+    an .ibd file and we are using the default innodb_page_size=16k. */
+    return(ssize == 0 || !is_ibd || srv_page_size != UNIV_PAGE_SIZE_ORIG);
+  }
+
+#ifndef UNIV_INNOCHECKSUM
+  MY_ATTRIBUTE((warn_unused_result))
+  /** Create a tablespace in fil_system.
+  @param id         tablespace identifier
+  @param flags      tablespace flags
+  @param purpose    tablespace purpose
+  @param crypt_data encryption information
+  @param mode       encryption mode
+  @param opened     true if space files are opened
+  @return pointer to created tablespace, to be filled in with add()
+  @retval nullptr on failure (such as when the same tablespace exists) */
+  static fil_space_t *create(uint32_t id, uint32_t flags,
+                             fil_type_t purpose, fil_space_crypt_t *crypt_data,
+                             fil_encryption_t mode= FIL_ENCRYPTION_DEFAULT,
+                             bool opened= false);
+
+  MY_ATTRIBUTE((warn_unused_result))
+  /** Acquire a tablespace reference.
+  @param id      tablespace identifier
+  @return tablespace
+  @retval nullptr if the tablespace is missing or inaccessible */
+  static fil_space_t *get(uint32_t id);
+  /** Acquire a tablespace reference for writing.
+  @param id      tablespace identifier
+  @return tablespace
+  @retval nullptr if the tablespace is missing or inaccessible */
+  static fil_space_t *get_for_write(uint32_t id);
+
+  /** Record a page in freed_ranges, or withdraw it from there,
+  while holding freed_range_mutex.
+  @param[in] offset     page number to be added or removed
+  @param[in] add        true to add the page to freed_ranges;
+                        false to remove it, which is skipped when
+                        freed_ranges is empty */
+  void free_page(uint32_t offset, bool add=true)
+  {
+    std::lock_guard<std::mutex> guard(freed_range_mutex);
+    if (add)
+      freed_ranges.add_value(offset);
+    else if (!freed_ranges.empty())
+      freed_ranges.remove_value(offset);
+  }
+
+  /** Replace freed_ranges with the given set of page ranges */
+  void add_free_ranges(range_set ranges)
+  {
+    std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+    freed_ranges= std::move(ranges);
+  }
+
+  /** Add one range of freed pages to freed_ranges.
+  NOTE(review): freed_range_mutex is not acquired here, unlike in
+  free_page(); presumably callers serialize access - confirm. */
+  void add_free_range(const range_t range)
+  { freed_ranges.add_range(range); }
+
+  /** Set both size and committed_size of the tablespace, in pages */
+  void set_sizes(uint32_t s)
+  {
+    ut_ad(id ? !size : (size >= s)); /* id!=0: size must not be set yet */
+    size= s; committed_size= s;
+  }
+
+  /** Copy size to committed_size; to be invoked in mtr_t::commit() */
+  void set_committed_size() { committed_size= size; }
+
+  /** @return the last persisted page number, that is, committed_size - 1 */
+  uint32_t last_page_number() const { return committed_size - 1; }
+
+  /** @return the size in pages (0 if unreadable) */
+  inline uint32_t get_size();
+
+  /** Read or write data.
+  @param type     I/O context
+  @param offset   offset in bytes
+  @param len      number of bytes
+  @param buf      the data to be read or written
+  @param bpage    buffer block (for type.is_async() completion callback)
+  @return status and file descriptor */
+  fil_io_t io(const IORequest &type, os_offset_t offset, size_t len,
+              void *buf, buf_page_t *bpage= nullptr);
+  /** Flush pending writes from the file system cache to the file. */
+  template<bool have_reference> inline void flush();
+  /** Flush pending writes from the file system cache to the file. */
+  void flush_low();
+
+  /** Read the first page of a data file.
+  @return whether the page was found valid */
+  bool read_page0();
+
+  /** Determine the next tablespace for encryption key rotation.
+  @param space    current tablespace (nullptr to start from the beginning)
+  @param recheck  whether the removal condition needs to be rechecked after
+                  encryption parameters were changed
+  @param encrypt  expected state of innodb_encrypt_tables
+  @return the next tablespace
+  @retval nullptr upon reaching the end of the iteration */
+  static space_list_t::iterator next(space_list_t::iterator space,
+                                     bool recheck, bool encrypt);
+
+#ifdef UNIV_DEBUG /* is_latched() is available only in debug builds */
+  bool is_latched() const { return latch_count != 0; }
+#endif /* UNIV_DEBUG */
+  bool is_owner() const { return latch_owner == pthread_self(); }
+  /** Acquire the allocation latch in exclusive mode and record the owner */
+  void x_lock()
+  {
+    latch.wr_lock(SRW_LOCK_CALL);
+    ut_ad(!latch_owner); /* nobody may be registered as the owner yet */
+    latch_owner= pthread_self(); /* consulted by is_owner() */
+    ut_ad(!latch_count.fetch_add(1)); /* the count must have been 0 */
+  }
+  /** Release the allocation latch from exclusive mode */
+  void x_unlock()
+  {
+    ut_ad(latch_count.fetch_sub(1) == 1); /* the count must have been 1 */
+    ut_ad(latch_owner == pthread_self()); /* only the owner may release */
+    latch_owner= 0; /* cleared before the latch itself is released */
+    latch.wr_unlock();
+  }
+  /** Acquire the allocation latch in shared mode */
+  void s_lock()
+  {
+    ut_ad(!is_owner()); /* the caller must not be the x_lock() owner */
+    latch.rd_lock(SRW_LOCK_CALL);
+    ut_ad(!latch_owner); /* no owner is recorded while held in shared mode */
+    ut_d(latch_count.fetch_add(1));
+  }
+  /** Release the allocation latch from shared mode */
+  void s_unlock()
+  {
+    ut_ad(latch_count.fetch_sub(1)); /* the count must have been nonzero */
+    ut_ad(!latch_owner);
+    latch.rd_unlock();
+  }
+
+  typedef span<const char> name_type;
+
+  /** @return the tablespace name (databasename/tablename) */
+  name_type name() const;
+
+private:
+  /** @return whether the file is usable for io() */
+  ATTRIBUTE_COLD bool prepare_acquired();
+  /** @return whether the file is usable for io() */
+  ATTRIBUTE_COLD bool acquire_and_prepare();
+#endif /*!UNIV_INNOCHECKSUM */
+};
+
+#ifndef UNIV_INNOCHECKSUM
+/** File node of a tablespace or the log data space */
+struct fil_node_t final
+{
+  /** tablespace containing this file */
+  fil_space_t *space;
+  /** file name; protected by fil_system.mutex and exclusive log_sys.latch */
+  char *name;
+  /** file handle; OS_FILE_CLOSED when the file is not open (see is_open()) */
+  pfs_os_file_t handle;
+  /** whether the file is on non-rotational media (SSD) */
+  unsigned on_ssd:1;
+  /** how to write page_compressed tables
+  (0=do not punch holes but write minimal amount of data, 1=punch holes,
+  2=always write the same amount; thinly provisioned storage will compress) */
+  unsigned punch_hole:2;
+  /** whether this file could use atomic write */
+  unsigned atomic_write:1;
+  /** whether the file actually is a raw device or disk partition */
+  unsigned is_raw_disk:1;
+  /** whether the tablespace discovery is being deferred during crash
+  recovery due to incompletely written page 0 */
+  unsigned deferred:1;
+
+  /** size of the file in database pages (0 if not known yet);
+  the possible last incomplete megabyte may be ignored if space->id == 0 */
+  uint32_t size;
+  /** initial size of the file in database pages;
+  FIL_IBD_FILE_INITIAL_SIZE by default */
+  uint32_t init_size;
+  /** maximum size of the file in database pages (0 if unlimited) */
+  uint32_t max_size;
+  /** whether the file is currently being extended */
+  Atomic_relaxed<bool> being_extended;
+  /** link to other files in this tablespace */
+  UT_LIST_NODE_T(fil_node_t) chain;
+
+  /** file system block size */
+  ulint block_size;
+
+  /** @return whether this file is open (handle is not OS_FILE_CLOSED) */
+  bool is_open() const { return handle != OS_FILE_CLOSED; }
+
+  /** Read the first page of a data file.
+  @return whether the page was found valid */
+  bool read_page0();
+
+  /** Determine some file metadata when creating or reading the file.
+  @param file   the file that is being created, or OS_FILE_CLOSED */
+  void find_metadata(os_file_t file= OS_FILE_CLOSED
+#ifndef _WIN32 /* the following parameters do not exist on Windows */
+                     , bool create= false, struct stat *statbuf= nullptr
+#endif /* !_WIN32 */
+                     );
+
+  /** Close the file handle. */
+  void close();
+  /** Same as close() but returns file handle instead of closing it. */
+  pfs_os_file_t detach() MY_ATTRIBUTE((warn_unused_result));
+  /** Prepare to free a file from fil_system.
+  @param detach_handle whether to detach instead of closing a handle
+  @return detached handle or OS_FILE_CLOSED */
+  inline pfs_os_file_t close_to_free(bool detach_handle= false);
+
+  /** Update the data structures on write completion */
+  inline void complete_write();
+
+private:
+  /** Perform the steps that are common to close() and detach() */
+  void prepare_to_close_or_detach();
+};
+
+inline bool fil_space_t::use_doublewrite() const
+{ /* doublewrite applies unless one of the conditions below rules it out */
+  return !(UT_LIST_GET_FIRST(chain)->atomic_write ||
+           !srv_use_doublewrite_buf || !buf_dblwr.is_created());
+}
+
+inline void fil_space_t::set_imported()
+{
+  ut_ad(purpose == FIL_TYPE_IMPORT);
+  purpose= FIL_TYPE_TABLESPACE; /* switch from IMPORT to regular tablespace */
+  UT_LIST_GET_FIRST(chain)->find_metadata(); /* for the first file only */
+}
+
+/** @return whether any file of the tablespace is not flagged on_ssd */
+inline bool fil_space_t::is_rotational() const
+{
+  const fil_node_t *node= UT_LIST_GET_FIRST(chain);
+  while (node && node->on_ssd)
+    node= UT_LIST_GET_NEXT(chain, node);
+  return node != nullptr;
+}
+
+/** Common InnoDB file extensions; the values are used as indexes to dot_ext[] */
+enum ib_extention {
+	NO_EXT = 0,
+	IBD = 1,
+	ISL = 2,
+	CFG = 3
+};
+extern const char* dot_ext[];
+#define DOT_IBD dot_ext[IBD]
+#define DOT_ISL dot_ext[ISL]
+#define DOT_CFG dot_ext[CFG]
+
+/** When mariadbd is run, the default directory "." is the mysqld datadir,
+but in the MariaDB Embedded Server Library and mysqlbackup it is not the default
+directory, and we must set the base file path explicitly */
+extern const char*	fil_path_to_mysql_datadir;
+#else
+# include "univ.i"
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Initial size of a single-table tablespace in pages */
+#define FIL_IBD_FILE_INITIAL_SIZE	4U
+
+/** 'null' (undefined) page offset in the context of file spaces */
+#define	FIL_NULL	ULINT32_UNDEFINED
+
+
+#define FIL_ADDR_PAGE	0U	/* first in address is the page offset */
+#define	FIL_ADDR_BYTE	4U	/* then comes 2-byte byte offset within page*/
+#define	FIL_ADDR_SIZE	6U	/* address size is 6 bytes */
+
+/** File space address; see FIL_ADDR_PAGE, FIL_ADDR_BYTE and FIL_ADDR_SIZE */
+struct fil_addr_t {
+  /** page number within a tablespace */
+  uint32_t page;
+  /** byte offset within the page */
+  uint16_t boffset;
+};
+
+/** The byte offsets on a file page for various variables @{ */
+#define FIL_PAGE_SPACE_OR_CHKSUM 0	/*!< in < MySQL-4.0.14 space id the
+					page belongs to (== 0) but in later
+					versions the 'new' checksum of the
+					page */
+#define FIL_PAGE_OFFSET		4U	/*!< page offset inside space */
+#define FIL_PAGE_PREV		8U	/*!< if there is a 'natural'
+					predecessor of the page, its
+					offset.  Otherwise FIL_NULL.
+					This field is not set on BLOB
+					pages, which are stored as a
+					singly-linked list.  See also
+					FIL_PAGE_NEXT. */
+#define FIL_PAGE_NEXT		12U	/*!< if there is a 'natural' successor
+					of the page, its offset.
+					Otherwise FIL_NULL.
+					B-tree index pages
+					(FIL_PAGE_TYPE contains FIL_PAGE_INDEX)
+					on the same PAGE_LEVEL are maintained
+					as a doubly linked list via
+					FIL_PAGE_PREV and FIL_PAGE_NEXT
+					in the collation order of the
+					smallest user record on each page. */
+#define FIL_PAGE_LSN		16U	/*!< lsn of the end of the newest
+					modification log record to the page */
+#define	FIL_PAGE_TYPE		24U	/*!< file page type: FIL_PAGE_INDEX,...,
+					2 bytes.
+
+					The contents of this field can only
+					be trusted in the following case:
+					if the page is an uncompressed
+					B-tree index page, then it is
+					guaranteed that the value is
+					FIL_PAGE_INDEX.
+					The opposite does not hold.
+
+					In tablespaces created by
+					MySQL/InnoDB 5.1.7 or later, the
+					contents of this field is valid
+					for all uncompressed pages. */
+
+/** For the first page in a system tablespace data file(ibdata*, not *.ibd):
+the file has been flushed to disk at least up to this lsn
+For other pages of tablespaces not in innodb_checksum_algorithm=full_crc32
+format: 32-bit key version used to encrypt the page + 32-bit checksum
+or 64 bits of zero if no encryption */
+#define FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION 26U
+
+/** This overloads FIL_PAGE_FILE_FLUSH_LSN for RTREE Split Sequence Number */
+#define	FIL_RTREE_SPLIT_SEQ_NUM	FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+
+/** Start of the page_compressed content */
+#define FIL_PAGE_COMP_ALGO	FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+
+/** starting from 4.1.x this contains the space id of the page */
+#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID  34U
+
+#define FIL_PAGE_SPACE_ID  FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID
+
+#define FIL_PAGE_DATA		38U	/*!< start of the data on the page */
+
+/** 32-bit key version used to encrypt the page in full_crc32 format.
+For non-encrypted page, it contains 0. */
+#define FIL_PAGE_FCRC32_KEY_VERSION	0
+
+/** page_compressed without innodb_checksum_algorithm=full_crc32 @{ */
+/** Number of bytes used to store actual payload data size on
+page_compressed pages when not using full_crc32. */
+#define FIL_PAGE_COMP_SIZE		0
+
+/** Number of bytes for FIL_PAGE_COMP_SIZE */
+#define FIL_PAGE_COMP_METADATA_LEN		2
+
+/** Number of bytes used to store actual compression method
+for encrypted tables when not using full_crc32. */
+#define FIL_PAGE_ENCRYPT_COMP_ALGO		2
+
+/** Extra header size for encrypted page_compressed pages when
+not using full_crc32 */
+#define FIL_PAGE_ENCRYPT_COMP_METADATA_LEN	4
+/* @} */
+
+/** File page trailer @{ */
+#define FIL_PAGE_END_LSN_OLD_CHKSUM 8	/*!< the low 4 bytes of this are used
+					to store the page checksum, the
+					last 4 bytes should be identical
+					to the last 4 bytes of FIL_PAGE_LSN */
+#define FIL_PAGE_DATA_END	8	/*!< size of the page trailer */
+
+/** Store the last 4 bytes of FIL_PAGE_LSN */
+#define FIL_PAGE_FCRC32_END_LSN 8
+
+/** Store crc32 checksum at the end of the page */
+#define FIL_PAGE_FCRC32_CHECKSUM	4
+/* @} */
+
+/** File page types (values of FIL_PAGE_TYPE) @{ */
+/** page_compressed, encrypted=YES (not used for full_crc32) */
+constexpr uint16_t FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED= 37401;
+/** page_compressed (not used for full_crc32) */
+constexpr uint16_t FIL_PAGE_PAGE_COMPRESSED= 34354;
+/** B-tree index page */
+constexpr uint16_t FIL_PAGE_INDEX= 17855;
+/** R-tree index page (SPATIAL INDEX) */
+constexpr uint16_t FIL_PAGE_RTREE= 17854;
+/** Undo log page */
+constexpr uint16_t FIL_PAGE_UNDO_LOG= 2;
+/** Index node (of file-in-file metadata) */
+constexpr uint16_t FIL_PAGE_INODE= 3;
+/** Insert buffer free list */
+constexpr uint16_t FIL_PAGE_IBUF_FREE_LIST= 4;
+/** Freshly allocated page */
+constexpr uint16_t FIL_PAGE_TYPE_ALLOCATED= 0;
+/** Change buffer bitmap (pages n*innodb_page_size+1) */
+constexpr uint16_t FIL_PAGE_IBUF_BITMAP= 5;
+/** System page */
+constexpr uint16_t FIL_PAGE_TYPE_SYS= 6;
+/** Transaction system data */
+constexpr uint16_t FIL_PAGE_TYPE_TRX_SYS= 7;
+/** Tablespace header (page 0) */
+constexpr uint16_t FIL_PAGE_TYPE_FSP_HDR= 8;
+/** Extent descriptor page (pages n*innodb_page_size, except 0) */
+constexpr uint16_t FIL_PAGE_TYPE_XDES= 9;
+/** Uncompressed BLOB page */
+constexpr uint16_t FIL_PAGE_TYPE_BLOB= 10;
+/** First ROW_FORMAT=COMPRESSED BLOB page */
+constexpr uint16_t FIL_PAGE_TYPE_ZBLOB= 11;
+/** Subsequent ROW_FORMAT=COMPRESSED BLOB page */
+constexpr uint16_t FIL_PAGE_TYPE_ZBLOB2= 12;
+/** In old tablespaces, garbage in FIL_PAGE_TYPE is replaced with this
+value when flushing pages. */
+constexpr uint16_t FIL_PAGE_TYPE_UNKNOWN= 13;
+
+/* File page types introduced in MySQL 5.7, not supported in MariaDB */
+//constexpr uint16_t FIL_PAGE_COMPRESSED = 14;
+//constexpr uint16_t FIL_PAGE_ENCRYPTED = 15;
+//constexpr uint16_t FIL_PAGE_COMPRESSED_AND_ENCRYPTED = 16;
+//constexpr uint16_t FIL_PAGE_ENCRYPTED_RTREE = 17;
+/** Clustered index root page after instant ADD COLUMN */
+constexpr uint16_t FIL_PAGE_TYPE_INSTANT= 18;
+
+/** Used by i_s.cc to index into the text description.
+Note: FIL_PAGE_TYPE_INSTANT maps to the same as FIL_PAGE_INDEX. */
+constexpr uint16_t FIL_PAGE_TYPE_LAST= FIL_PAGE_TYPE_UNKNOWN;
+
+/** Bit number of the flag that is set in FIL_PAGE_TYPE for full_crc32 pages
+in page_compressed format. If the flag is set, then the following holds for
+the remaining bits of FIL_PAGE_TYPE:
+Bits 0..7 will contain the compressed page size in bytes.
+Bits 8..14 are reserved and must be 0. */
+constexpr uint16_t FIL_PAGE_COMPRESS_FCRC32_MARKER= 15;
+/* @} */
+
+/** Determine whether a FIL_PAGE_TYPE value denotes an index page.
+@param page_type  value of the FIL_PAGE_TYPE field
+@return whether the type is FIL_PAGE_INDEX, FIL_PAGE_RTREE or
+FIL_PAGE_TYPE_INSTANT */
+inline bool fil_page_type_is_index(uint16_t page_type)
+{
+  /* FIL_PAGE_TYPE_INSTANT is a clustered index root page, so it counts */
+  return page_type == FIL_PAGE_INDEX ||
+    page_type == FIL_PAGE_RTREE ||
+    page_type == FIL_PAGE_TYPE_INSTANT;
+}
+
+/** Check whether the page is an index page (either a regular B-tree index
+or an R-tree index) */
+#define fil_page_index_page_check(page)                         \
+        fil_page_type_is_index(fil_page_get_type(page))
+
+/** Get the file page type.
+@param[in]	page	file page; page + FIL_PAGE_TYPE is assumed 2-byte aligned
+@return page type, read with mach_read_from_2() at offset FIL_PAGE_TYPE */
+inline uint16_t fil_page_get_type(const byte *page)
+{
+  return mach_read_from_2(my_assume_aligned<2>(page + FIL_PAGE_TYPE));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/** Number of pending tablespace flushes */
+extern Atomic_counter<ulint> fil_n_pending_tablespace_flushes;
+
+/** Look up a tablespace.
+The caller should hold an InnoDB table lock or a MDL that prevents
+the tablespace from being dropped during the operation,
+or the caller should be in single-threaded crash recovery mode
+(no user connections that could drop tablespaces).
+Normally, fil_space_t::get() should be used instead.
+@param[in]	id	tablespace ID
+@return tablespace, or NULL if not found */
+fil_space_t *fil_space_get(uint32_t id)
+  MY_ATTRIBUTE((warn_unused_result));
+
+/** The tablespace memory cache */
+struct fil_system_t
+{
+  /**
+    Constructor.
+
+    Some members may require late initialisation, so the object is only
+    marked as uninitialised here. Real initialisation happens in create().
+  */
+  fil_system_t() : m_initialised(false) {}
+
+  bool is_initialised() const { return m_initialised; } /*!< @return flag */
+
+  /**
+    Create the file system interface at database start.
+
+    @param[in] hash_size	hash table size
+  */
+  void create(ulint hash_size);
+
+  /** Close the file system interface at shutdown */
+  void close();
+
+private:
+  bool m_initialised;
+
+  /** Points to the last opened space in space_list. Protected with
+  fil_system.mutex. */
+  fil_space_t *space_list_last_opened= nullptr;
+
+#ifdef __linux__
+  /** available block devices that reside on non-rotational storage */
+  std::vector<dev_t> ssd;
+public:
+  /** @return whether a file system device is on non-rotational storage */
+  bool is_ssd(dev_t dev) const
+  {
+    /* Linux seems to allow up to 15 partitions per block device. If the ssd
+    is a whole device ("partition number 0"), compare dev without its partition
+    number. The mask is dev_t wide: ~15U would clear bits above bit 31 too. */
+    for (const auto s : ssd)
+      if (dev == s || (dev & ~static_cast<dev_t>(15)) == s)
+        return true;
+    return false;
+  }
+#endif
+public:
+  /** Detach a tablespace from the cache and close the files.
+  @param space tablespace
+  @param detach_handle whether to detach the handle, instead of closing
+  @return detached handle
+  @retval OS_FILE_CLOSED if no handle was detached */
+  pfs_os_file_t detach(fil_space_t *space, bool detach_handle= false);
+
+  /** the mutex protecting most data fields, and some fields of fil_space_t */
+  mysql_mutex_t mutex;
+	fil_space_t*	sys_space;	/*!< The innodb_system tablespace */
+	fil_space_t*	temp_space;	/*!< The innodb_temporary tablespace */
+  /** Map of fil_space_t::id to fil_space_t* */
+  hash_table_t spaces;
+  /** tablespaces for which fil_space_t::needs_flush() holds */
+  sized_ilist<fil_space_t, unflushed_spaces_tag_t> unflushed_spaces;
+  /** number of currently open files; protected by mutex */
+  ulint n_open;
+  /** last time we noted n_open exceeding the limit; protected by mutex */
+  time_t n_open_exceeded_time;
+  /** maximum persistent tablespace id that has ever been assigned */
+  uint32_t max_assigned_id;
+  /** nonzero if fil_node_open_file_low() should avoid moving the tablespace
+  to the end of space_list, for FIFO policy of try_to_close() */
+  ulint freeze_space_list;
+  /** List of all file spaces, opened spaces should be at the top of the list
+  to optimize try_to_close() execution. Protected with fil_system.mutex. */
+  ilist<fil_space_t, space_list_tag_t> space_list;
+  /** list of all tablespaces for which a FILE_MODIFY record has been written
+  since the latest redo log checkpoint.
+  Protected only by exclusive log_sys.latch. */
+  ilist<fil_space_t, named_spaces_tag_t> named_spaces;
+
+  /** list of all ENCRYPTED=DEFAULT tablespaces that need
+  to be converted to the current value of innodb_encrypt_tables */
+  ilist<fil_space_t, default_encrypt_tag_t> default_encrypt_tables;
+
+  /** whether fil_space_t::create() has issued a warning about
+  potential space_id reuse */
+  bool space_id_reuse_warned;
+
+  /** Add the file to the end of opened spaces list in
+  fil_system.space_list, so that fil_space_t::try_to_close() should close
+  it as a last resort.
+  @param space space to add */
+  void add_opened_last_to_space_list(fil_space_t *space);
+
+  /** Reposition a tablespace at the end of the opened spaces in
+  fil_system.space_list, so that fil_space_t::try_to_close() should close
+  it as a last resort.
+  @param space space to move */
+  inline void move_opened_last_to_space_list(fil_space_t *space)
+  {
+    /* When several files of the same space are added in a row (which can
+    happen for the system or temporary tablespace), the space already is
+    space_list_last_opened and need not be erased and re-added. */
+    if (!freeze_space_list && space_list_last_opened != space)
+    {
+      space_list.erase(space_list_t::iterator(space));
+      add_opened_last_to_space_list(space);
+    }
+  }
+
+  /** Move closed file last in fil_system.space_list, so that
+  fil_space_t::try_to_close() iterates opened files first in FIFO order,
+  i.e. first opened, first closed. Does nothing if freeze_space_list is set.
+  @param space space to move */
+  void move_closed_last_to_space_list(fil_space_t *space)
+  {
+    if (UNIV_UNLIKELY(freeze_space_list))
+      return;
+
+    space_list_t::iterator s= space_list_t::iterator(space);
+
+    if (space_list_last_opened == space)
+    {
+      ut_ad(s != space_list.begin()); /* a predecessor must exist */
+      space_list_t::iterator prev= s;
+      space_list_last_opened= &*--prev; /* the predecessor takes over */
+    }
+
+    space_list.erase(s);
+    space_list.push_back(*space);
+  }
+
+  /** Return the next tablespace from default_encrypt_tables list.
+  @param space   previous tablespace (nullptr to start from the start)
+  @param recheck whether the removal condition needs to be rechecked after
+  the encryption parameters were changed
+  @param encrypt expected state of innodb_encrypt_tables
+  @return the next tablespace to process (n_pending_ops incremented)
+  @retval fil_system.temp_space if there is no work to do
+  @retval nullptr upon reaching the end of the iteration */
+  inline fil_space_t* default_encrypt_next(fil_space_t *space, bool recheck,
+                                           bool encrypt);
+
+  /** Extend all open data files to the recovered size */
+  ATTRIBUTE_COLD void extend_to_recv_size();
+
+  /** Determine if a tablespace associated with a file name exists.
+  @param path   tablespace file name to look for
+  @return a matching tablespace */
+  inline fil_space_t *find(const char *path) const;
+};
+
+/** The tablespace memory cache. */
+extern fil_system_t	fil_system;
+
+inline void fil_space_t::reacquire()
+{ /* Increment n_pending; in debug builds n is the value before that. */
+  ut_d(uint32_t n=) n_pending.fetch_add(1, std::memory_order_relaxed);
+#ifdef SAFE_MUTEX
+  if (mysql_mutex_is_owner(&fil_system.mutex)) return;
+  ut_ad(n & PENDING); /* some PENDING bits must have been set already */
+  ut_ad(UT_LIST_GET_FIRST(chain)->is_open());
+#endif /* SAFE_MUTEX */
+}
+
+/** Note that operations on the tablespace must stop (set STOPPING_WRITES). */
+inline void fil_space_t::set_stopping()
+{
+  mysql_mutex_assert_owner(&fil_system.mutex);
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+  static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
+  __asm__ __volatile__("lock btsl $30, %0" : "+m" (n_pending));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+  static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
+  _interlockedbittestandset(reinterpret_cast<volatile long*>(&n_pending), 30);
+#else /* other compilers or targets: use the atomic fetch_or() */
+  n_pending.fetch_or(STOPPING_WRITES, std::memory_order_relaxed);
+#endif
+}
+
+inline void fil_space_t::clear_stopping()
+{ /* Subtract the STOPPING_WRITES bit from n_pending. */
+  mysql_mutex_assert_owner(&fil_system.mutex);
+  static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
+  ut_d(auto n=) n_pending.fetch_sub(STOPPING_WRITES, std::memory_order_relaxed);
+  ut_ad((n & STOPPING) == STOPPING_WRITES); /* only that bit had been set */
+}
+
+/** Flush pending writes from the file system cache to the file.
+@tparam have_reference  whether PENDING is expected to be set in pending()
+already; if not, flush_low() is wrapped in acquire_low() and release() */
+template<bool have_reference> inline void fil_space_t::flush()
+{
+  mysql_mutex_assert_not_owner(&fil_system.mutex);
+  ut_ad(!have_reference || (pending() & PENDING));
+  ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT);
+  if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
+  {
+    ut_ad(!is_in_unflushed_spaces);
+    ut_ad(!needs_flush());
+    return;
+  }
+  if (have_reference)
+    flush_low();
+  else if (!(acquire_low(STOPPING | CLOSING) & (STOPPING | CLOSING)))
+  {
+    flush_low();
+    release();
+  }
+}
+
+/** @return the size in pages (0 if unreadable) */
+inline uint32_t fil_space_t::get_size()
+{
+  if (size)
+    return size;
+  /* The size is not known yet: invoke read_page0() under fil_system.mutex */
+  mysql_mutex_lock(&fil_system.mutex);
+  read_page0();
+  mysql_mutex_unlock(&fil_system.mutex);
+  return size;
+}
+
+#include "fil0crypt.h"
+
+/*******************************************************************//**
+Assigns a new space id for a new single-table tablespace. This works simply by
+incrementing the global counter. If 4 billion ids are not enough, we may need
+to recycle ids.
+@return true if assigned, false if not */
+bool fil_assign_new_space_id(uint32_t *space_id);
+
+/** Frees a space object from the tablespace memory cache.
+Closes the files in the chain but does not delete them.
+There must not be any pending i/o's or flushes on the files.
+@param id          tablespace identifier
+@param x_latched   whether the caller holds exclusive fil_space_t::latch
+@return true if success */
+bool fil_space_free(uint32_t id, bool x_latched);
+
+/** Set the recovered size of a tablespace in pages.
+@param	id	tablespace ID
+@param	size	recovered size in pages
+@param	flags	tablespace flags */
+void fil_space_set_recv_size_and_flags(uint32_t id, uint32_t size,
+                                       uint32_t flags);
+
+/*******************************************************************//**
+Sets the max tablespace id counter if the given number is bigger than the
+previous value. */
+void fil_set_max_space_id_if_bigger(uint32_t max_id);
+
+MY_ATTRIBUTE((warn_unused_result))
+/** Delete a tablespace and associated .ibd file.
+@param id    tablespace identifier
+@return detached file handle (to be closed by the caller)
+@retval	OS_FILE_CLOSED if no file existed */
+pfs_os_file_t fil_delete_tablespace(uint32_t id);
+
+/** Close a single-table tablespace on failed IMPORT TABLESPACE.
+The tablespace must be cached in the memory cache.
+Free all pages used by the tablespace. */
+void fil_close_tablespace(uint32_t id);
+
+/*******************************************************************//**
+Allocates and builds a file name from a path, a table or tablespace name
+and a suffix. The string must be freed by caller with ut_free().
+@param[in] path NULL or the directory path or the full path and filename.
+@param[in] name {} if path is full, or Table/Tablespace name
+@param[in] ext the file extension to use
+@param[in] trim_name true if the last name on the path should be trimmed.
+@return own: file name */
+char* fil_make_filepath(const char *path, const fil_space_t::name_type &name,
+                        ib_extention ext, bool trim_name);
+
+char *fil_make_filepath(const char* path, const table_name_t name,
+                        ib_extention suffix, bool strip_name);
+
+/** Create a tablespace file.
+@param[in]	space_id	Tablespace ID
+@param[in]	name		Tablespace name in dbname/tablename format.
+@param[in]	path		Path and filename of the datafile to create.
+@param[in]	flags		Tablespace flags
+@param[in]	size		Initial size of the tablespace file in pages,
+must be >= FIL_IBD_FILE_INITIAL_SIZE
+@param[in]	mode		MariaDB encryption mode
+@param[in]	key_id		MariaDB encryption key_id
+@param[out]	err		DB_SUCCESS or error code
+@return	the created tablespace
+@retval	NULL	on error */
+fil_space_t*
+fil_ibd_create(
+	uint32_t	space_id,
+	const table_name_t name,
+	const char*	path,
+	uint32_t	flags,
+	uint32_t	size,
+	fil_encryption_t mode,
+	uint32_t	key_id,
+	dberr_t*	err)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Try to adjust FSP_SPACE_FLAGS if they differ from the expectations.
+(Typically when upgrading from MariaDB 10.1.0..10.1.20.)
+@param[in,out]	space		tablespace
+@param[in]	flags		desired tablespace flags */
+void fsp_flags_try_adjust(fil_space_t *space, uint32_t flags);
+
+/********************************************************************//**
+Tries to open a single-table tablespace and optionally checks the space id is
+right in it. If does not succeed, prints an error message to the .err log. This
+function is used to open a tablespace when we start up mysqld, and also in
+IMPORT TABLESPACE.
+NOTE that we assume this operation is used either at the database startup
+or under the protection of dict_sys.latch, so that two users cannot
+race here. This operation does not leave the file associated with the
+tablespace open, but closes it after we have looked at the space id in it.
+
+If the validate boolean is set, we read the first page of the file and
+check that the space id in the file is what we expect. We assume that
+this function runs much faster if no check is made, since accessing the
+file inode probably is much faster (the OS caches them) than accessing
+the first page of the file.  This boolean may be initially false, but if
+a remote tablespace is found it will be changed to true.
+
+@param[in]	validate	0=maybe missing, 1=do not validate, 2=validate
+@param[in]	purpose		FIL_TYPE_TABLESPACE or FIL_TYPE_TEMPORARY
+@param[in]	id		tablespace ID
+@param[in]	flags		expected FSP_SPACE_FLAGS
+@param[in]	name		table name
+If file-per-table, it is the table name in the databasename/tablename format
+@param[in]	path_in		expected filepath, usually read from dictionary
+@param[out]	err		DB_SUCCESS or error code
+@return	tablespace
+@retval	NULL	if the tablespace could not be opened */
+fil_space_t*
+fil_ibd_open(
+	unsigned		validate,
+	fil_type_t		purpose,
+	uint32_t		id,
+	uint32_t		flags,
+	fil_space_t::name_type	name,
+	const char*		path_in,
+	dberr_t*		err = NULL)
+	MY_ATTRIBUTE((warn_unused_result));
+
+enum fil_load_status { /* status returned by fil_ibd_load() */
+	/** The tablespace file(s) were found and valid. */
+	FIL_LOAD_OK,
+	/** The name no longer matches space_id */
+	FIL_LOAD_ID_CHANGED,
+	/** The file(s) were not found */
+	FIL_LOAD_NOT_FOUND,
+	/** The file(s) were not valid */
+	FIL_LOAD_INVALID,
+	/** Opening the tablespace file was deferred */
+	FIL_LOAD_DEFER
+};
+
+/** Open a single-file tablespace and add it to the InnoDB data structures.
+@param[in]	space_id	tablespace ID
+@param[in]	filename	path/to/databasename/tablename.ibd
+@param[out]	space		the tablespace, or NULL on error
+@return status of the operation */
+enum fil_load_status
+fil_ibd_load(uint32_t space_id, const char *filename, fil_space_t *&space)
+  MY_ATTRIBUTE((warn_unused_result));
+
+/** Determine if a matching tablespace exists in the InnoDB tablespace
+memory cache. Note that if we have not done a crash recovery at the database
+startup, there may be many tablespaces which are not yet in the memory cache.
+@param[in]	id		Tablespace ID
+@param[in]	table_flags	table flags
+@return the tablespace
+@retval	NULL	if no matching tablespace exists in the memory cache */
+fil_space_t *fil_space_for_table_exists_in_mem(uint32_t id,
+                                               uint32_t table_flags);
+
+/** Try to extend a tablespace if it is smaller than the specified size.
+@param[in,out]	space	tablespace
+@param[in]	size	desired size in pages
+@return whether the tablespace is at least as big as requested */
+bool fil_space_extend(fil_space_t *space, uint32_t size);
+
+/** Flush to disk the writes in file spaces that are
+possibly cached by the OS. */
+void fil_flush_file_spaces();
+/******************************************************************//**
+Checks the consistency of the tablespace cache.
+@return true if ok */
+bool fil_validate();
+/*********************************************************************//**
+Sets the file page type. */
+void
+fil_page_set_type(
+/*==============*/
+	byte*	page,	/*!< in/out: file page */
+	ulint	type);	/*!< in: type */
+
+/********************************************************************//**
+Delete the tablespace file and any related files like .cfg.
+This should not be called for temporary tables. */
+void
+fil_delete_file(
+/*============*/
+	const char*	path);	/*!< in: filepath of the ibd tablespace */
+
+/** Look up a tablespace.
+@param id tablespace identifier
+@return tablespace
+@retval nullptr if not found */
+fil_space_t *fil_space_get_by_id(uint32_t id);
+
+/** Note that a non-predefined persistent tablespace has been modified
+by redo log.
+@param[in,out]	space	tablespace */
+void
+fil_names_dirty(
+	fil_space_t*	space);
+
+
+bool fil_comp_algo_loaded(ulint comp_algo);
+
+/** On a log checkpoint, reset fil_names_dirty_and_write() flags
+and write out FILE_MODIFY if needed, and write FILE_CHECKPOINT.
+@param lsn  checkpoint LSN
+@return current LSN */
+lsn_t fil_names_clear(lsn_t lsn);
+
+#ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH
+void test_make_filepath();
+#endif /* UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH */
+
+/** Determine the block size of the data file.
+@param[in]	space		tablespace
+@param[in]	offset		page number
+@return	block size */
+ulint fil_space_get_block_size(const fil_space_t* space, unsigned offset);
+
+/** Check whether encryption key found
+@param crypt_data Encryption data
+@param f_name     File name
+@return encryption key found */
+bool fil_crypt_check(fil_space_crypt_t *crypt_data, const char *f_name);
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/fil0pagecompress.h b/storage/innobase/include/fil0pagecompress.h
new file mode 100644
index 00000000..2927da3c
--- /dev/null
+++ b/storage/innobase/include/fil0pagecompress.h
@@ -0,0 +1,57 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef fil0pagecompress_h
+#define fil0pagecompress_h
+
+#include "fsp0fsp.h"
+
+/******************************************************************//**
+@file include/fil0pagecompress.h
+Helper functions for extracting/storing page compression and
+atomic writes information to table space.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+/** Compress a page_compressed page before writing to a data file.
+@param[in]	buf		page to be compressed
+@param[out]	out_buf		compressed page
+@param[in]	flags		tablespace flags
+@param[in]	block_size	file system block size
+@param[in]	encrypted	whether the page will be subsequently encrypted
+@return actual length of compressed page
+@retval	0	if the page was not compressed */
+ulint fil_page_compress(
+	const byte*	buf,
+	byte*		out_buf,
+	uint32_t	flags,
+	ulint		block_size,
+	bool		encrypted)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Decompress a page that may be subject to page_compressed compression.
+@param[in,out]	tmp_buf		temporary buffer (of innodb_page_size)
+@param[in,out]	buf		compressed page buffer
+@param[in]	flags		tablespace flags
+@return size of the compressed data
+@retval	0		if decompression failed
+@retval	srv_page_size	if the page was not compressed */
+ulint fil_page_decompress(byte *tmp_buf, byte *buf, uint32_t flags)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+#endif
diff --git a/storage/innobase/include/fsp0file.h b/storage/innobase/include/fsp0file.h
new file mode 100644
index 00000000..67e79f1a
--- /dev/null
+++ b/storage/innobase/include/fsp0file.h
@@ -0,0 +1,509 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0file.h
+Tablespace data file implementation.
+
+Created 2013-7-26 by Kevin Lewis
+*******************************************************/
+
+#ifndef fsp0file_h
+#define fsp0file_h
+
+#include "mem0mem.h"
+#include "os0file.h"
+#include "fil0fil.h"
+
+/** Types of raw partitions in innodb_data_file_path; see Datafile::m_type */
+enum device_t {
+	SRV_NOT_RAW = 0,	/*!< Not a raw partition */
+	SRV_NEW_RAW,		/*!< A 'newraw' partition, only to be
+				initialized */
+	SRV_OLD_RAW		/*!< An initialized raw partition */
+};
+
+/** Data file control information. */
+class Datafile {
+
+	friend class Tablespace;
+	friend class SysTablespace;
+
+public:
+	/** Construct an object with no file path and m_space_id=UINT32_MAX. */
+	Datafile()
+		:
+		m_filepath(),
+		m_filename(),
+		m_handle(),
+		m_open_flags(OS_FILE_OPEN),
+		m_size(),
+		m_order(),
+		m_type(SRV_NOT_RAW),
+		m_space_id(UINT32_MAX),
+		m_flags(),
+		m_exists(),
+		m_is_valid(),
+		m_first_page(),
+		m_last_os_error(),
+		m_file_info()
+	{
+		/* No op */
+	}
+	/** Construct with the given tablespace flags, size and order. */
+	Datafile(uint32_t flags, uint32_t size, ulint order)
+		:
+		m_filepath(),
+		m_filename(),
+		m_handle(),
+		m_open_flags(OS_FILE_OPEN),
+		m_size(size),
+		m_order(order),
+		m_type(SRV_NOT_RAW),
+		m_space_id(UINT32_MAX),
+		m_flags(flags),
+		m_exists(),
+		m_is_valid(),
+		m_first_page(),
+		m_last_os_error(),
+		m_file_info()
+	{
+	}
+	/** Copy; the path is duplicated, the first page and m_file_info are not. */
+	Datafile(const Datafile& file)
+		:
+		m_handle(file.m_handle),
+		m_open_flags(file.m_open_flags),
+		m_size(file.m_size),
+		m_order(file.m_order),
+		m_type(file.m_type),
+		m_space_id(file.m_space_id),
+		m_flags(file.m_flags),
+		m_exists(file.m_exists),
+		m_is_valid(file.m_is_valid),
+		m_first_page(),
+		m_last_os_error(),
+		m_file_info()
+	{
+		if (file.m_filepath != NULL) {
+			m_filepath = mem_strdup(file.m_filepath);
+			ut_a(m_filepath != NULL);
+			set_filename();
+		} else {
+			m_filepath = NULL;
+			m_filename = NULL;
+		}
+	}
+
+	virtual ~Datafile()
+	{
+		shutdown();
+	}
+	/** Assign from another object; the handle of this object must be closed. */
+	Datafile& operator=(const Datafile& file)
+	{
+		ut_a(this != &file);
+
+		m_size = file.m_size;
+		m_order = file.m_order;
+		m_type = file.m_type;
+
+		ut_a(m_handle == OS_FILE_CLOSED);
+		m_handle = file.m_handle;
+
+		m_exists = file.m_exists;
+		m_is_valid = file.m_is_valid;
+		m_open_flags = file.m_open_flags;
+		m_space_id = file.m_space_id;
+		m_flags = file.m_flags;
+		m_last_os_error = 0;
+
+		if (m_filepath != NULL) {
+			ut_free(m_filepath);
+			m_filepath = NULL;
+			m_filename = NULL;
+		}
+
+		if (file.m_filepath != NULL) {
+			m_filepath = mem_strdup(file.m_filepath);
+			ut_a(m_filepath != NULL);
+			set_filename();
+		}
+
+		/* Do not make a copy of the first page,
+		it should be reread if needed */
+		m_first_page = NULL;
+
+		return(*this);
+	}
+
+	/** Initialize the tablespace flags */
+	void init(uint32_t flags) { m_flags= flags; }
+
+	/** Release the resources. */
+	virtual void shutdown();
+
+	/** Open a data file in read-only mode to check if it exists
+	so that it can be validated.
+	@param[in]	strict	whether to issue error messages
+	@return DB_SUCCESS or error code */
+	dberr_t open_read_only(bool strict);
+
+	/** Open a data file in read-write mode during start-up so that
+	doublewrite pages can be restored and then it can be validated.
+	@return DB_SUCCESS or error code */
+	inline dberr_t open_read_write()
+		MY_ATTRIBUTE((warn_unused_result));
+
+	/** Initialize OS specific file info. */
+	void init_file_info();
+
+	/** Close a data file.
+	@return DB_SUCCESS or error code */
+	dberr_t close();
+
+	/** Make a full filepath from a directory path and a filename.
+	Prepend the dirpath to filename using the extension given.
+	If dirpath is NULL, prepend the default datadir to filepath.
+	Store the result in m_filepath.
+	@param dirpath  directory path
+	@param name     tablespace (table) name
+	@param ext      filename extension */
+	void make_filepath(const char* dirpath, fil_space_t::name_type name,
+			   ib_extention ext);
+
+	/** Set the filepath by duplicating the filepath sent in */
+	void set_filepath(const char* filepath);
+
+	/** Validates the datafile and checks that it conforms with
+	the expected space ID and flags.  The file should exist and be
+	successfully opened in order for this function to validate it.
+	@param[in]	space_id	The expected tablespace ID.
+	@param[in]	flags		The expected tablespace flags.
+	@retval DB_SUCCESS if tablespace is valid, DB_ERROR if not.
+	m_is_valid is also set true on success, else false. */
+	dberr_t validate_to_dd(uint32_t space_id, uint32_t flags)
+		MY_ATTRIBUTE((warn_unused_result));
+
+	/** Validates this datafile for the purpose of recovery.
+	The file should exist and be successfully opened. We initially
+	open it in read-only mode because we just want to read the SpaceID.
+	However, if the first page is corrupt and needs to be restored
+	from the doublewrite buffer, we will reopen it in write mode and
+	try to restore that page.
+	@retval DB_SUCCESS if tablespace is valid, DB_ERROR if not.
+	m_is_valid is also set true on success, else false. */
+	dberr_t validate_for_recovery()
+		MY_ATTRIBUTE((warn_unused_result));
+
+	/** Checks the consistency of the first page of a datafile when the
+	tablespace is opened.  This occurs before the fil_space_t is created
+	so the Space ID found here must not already be open.
+	m_is_valid is set true on success, else false.
+	@retval DB_SUCCESS if the datafile is valid
+	@retval DB_CORRUPTION if the datafile is not readable
+	@retval DB_TABLESPACE_EXISTS if there is a duplicate space_id */
+	dberr_t validate_first_page()
+		MY_ATTRIBUTE((warn_unused_result));
+
+	/** Get Datafile::m_filepath.
+	@return m_filepath */
+	const char*	filepath()	const
+	{
+		return(m_filepath);
+	}
+
+	/** Get Datafile::m_handle.
+	@return m_handle */
+	pfs_os_file_t	handle()	const
+	{
+		return(m_handle);
+	}
+
+	/** @return detached file handle */
+	pfs_os_file_t detach()
+	{
+		pfs_os_file_t detached = m_handle;
+		m_handle = OS_FILE_CLOSED;
+		return detached;
+	}
+
+	/** Get Datafile::m_order.
+	@return m_order */
+	ulint	order()	const
+	{
+		return(m_order);
+	}
+
+	/** Get Datafile::m_space_id.
+	@return m_space_id */
+	uint32_t space_id() const { return m_space_id; }
+
+	/** Get Datafile::m_flags.
+	@return m_flags */
+	uint32_t flags() const { return m_flags; }
+
+	/**
+	@return true if m_handle is open, false if not */
+	bool is_open() const { return m_handle != OS_FILE_CLOSED; }
+
+	/** Get Datafile::m_is_valid.
+	@return m_is_valid */
+	bool	is_valid()	const
+	{
+		return(m_is_valid);
+	}
+
+	/** Get the last OS error reported
+	@return m_last_os_error */
+	ulint	last_os_error()		const
+	{
+		return(m_last_os_error);
+	}
+
+	/** Check whether the file is empty.
+	@return true if file is empty */
+	bool	is_empty_file()		const
+	{
+#ifdef _WIN32
+		os_offset_t	offset =
+			(os_offset_t) m_file_info.nFileSizeLow
+			| ((os_offset_t) m_file_info.nFileSizeHigh << 32);
+
+		return (offset == 0);
+#else
+		return (m_file_info.st_size == 0);
+#endif
+	}
+
+	/** Check if the file exists.
+	@return true if file exists. */
+	bool exists()	const { return m_exists; }
+
+	/** Test if the filepath provided looks the same as this filepath
+	by string comparison. If they are two different paths to the same
+	file, same_as() will be used to show that after the files are opened.
+	@param[in]	other	filepath to compare with
+	@retval true if it is the same filename by char comparison
+	@retval false if it looks different */
+	bool same_filepath_as(const char* other) const;
+
+	/** Test if another opened datafile is the same file as this object.
+	@param[in]	other	Datafile to compare with
+	@return true if it is the same file, else false */
+	bool same_as(const Datafile&	other) const;
+
+	/** Get access to the first data page.
+	It is valid after open_read_only() succeeded.
+	@return the first data page */
+	const byte* get_first_page() const { return(m_first_page); }
+	/** Set Datafile::m_space_id. */
+	void set_space_id(uint32_t space_id) { m_space_id= space_id; }
+	/** Set Datafile::m_flags. */
+	void set_flags(uint32_t flags) { m_flags = flags; }
+private:
+	/** Free the filepath buffer. */
+	void free_filepath();
+
+	/** Set the filename pointer to the start of the file name
+	in the filepath. */
+	void set_filename()
+	{
+		if (!m_filepath) {
+			return;
+		}
+
+		char *last_slash = strrchr(m_filepath, '/');
+#ifdef _WIN32
+		/* A Windows path may use only '\\', or mix both
+		separators; take whichever separator occurs last. */
+		if (char *last = strrchr(m_filepath, '\\')) {
+			if (!last_slash || last > last_slash) {
+				last_slash = last;
+			}
+		}
+#endif
+		/* Without any separator, the whole path is the name. */
+		m_filename = last_slash ? last_slash + 1 : m_filepath;
+	}
+
+	/** Create/open a data file.
+	@param[in]	read_only_mode	if true, then readonly mode checks
+					are enforced.
+	@return DB_SUCCESS or error code */
+	dberr_t open_or_create(bool read_only_mode)
+		MY_ATTRIBUTE((warn_unused_result));
+
+	/** Reads a few significant fields from the first page of the
+	datafile, which must already be open.
+	@param[in]	read_only_mode	if true, then readonly mode checks
+					are enforced.
+	@return DB_SUCCESS or DB_IO_ERROR if page cannot be read */
+	dberr_t read_first_page(bool read_only_mode)
+		MY_ATTRIBUTE((warn_unused_result));
+
+	/** Free the first page from memory when it is no longer needed. */
+	void free_first_page();
+
+	/** Set the Datafile::m_open_flags.
+	@param open_flags	The Open flags to set. */
+	void set_open_flags(os_file_create_t	open_flags)
+	{
+		m_open_flags = open_flags;
+	}
+
+	/** Determine if this datafile is on a Raw Device
+	@return true if it is a RAW device. */
+	bool is_raw_device() const
+	{
+		return(m_type != SRV_NOT_RAW);
+	}
+
+	/* DATA MEMBERS */
+
+protected:
+	/** Physical file path with base name and extension */
+	char*			m_filepath;
+
+private:
+	/** Determine the space id of the given file descriptor by reading
+	a few pages from the beginning of the .ibd file.
+	@return DB_SUCCESS if space id was successfully identified,
+	else DB_ERROR. */
+	dberr_t find_space_id();
+
+	/** Points into m_filepath to the file name with extension */
+	char*			m_filename;
+
+	/** Open file handle */
+	pfs_os_file_t		m_handle;
+
+	/** Flags to use for opening the data file */
+	os_file_create_t	m_open_flags;
+
+	/** size in megabytes or pages; converted from megabytes to
+	pages in SysTablespace::normalize_size() */
+	uint32_t		m_size;
+
+	/** ordinal position of this datafile in the tablespace */
+	ulint			m_order;
+
+	/** The type of the data file */
+	device_t		m_type;
+
+	/** Tablespace ID. Contained in the datafile header.
+	If this is a system tablespace, FSP_SPACE_ID is only valid
+	in the first datafile. */
+	uint32_t		m_space_id;
+
+	/** Tablespace flags. Contained in the datafile header.
+	If this is a system tablespace, FSP_SPACE_FLAGS are only valid
+	in the first datafile. */
+	uint32_t		m_flags;
+
+	/** true if file already existed on startup */
+	bool			m_exists;
+
+	/** true if the tablespace is valid */
+	bool			m_is_valid;
+
+	/** Aligned buffer to hold first page */
+	byte*			m_first_page;
+
+protected:
+	/** Last OS error received so it can be reported if needed. */
+	ulint			m_last_os_error;
+
+public:
+	/** true if table is deferred during recovery */
+	bool			m_defer=false;
+	/** Use the following to determine the uniqueness of this datafile. */
+#ifdef _WIN32
+	/* Use fields dwVolumeSerialNumber, nFileIndexLow, nFileIndexHigh. */
+	BY_HANDLE_FILE_INFORMATION	m_file_info;
+#else
+	/* Use field st_ino. */
+	struct stat			m_file_info;
+#endif	/* _WIN32 */
+};
+
+
+/** A Datafile that is located through an InnoDB Symbolic Link (ISL) file. */
+class RemoteDatafile : public Datafile
+{
+private:
+	/** Full path of the link file */
+	char*	m_link_filepath;
+
+public:
+
+	/** Construct with no link file path. */
+	RemoteDatafile()
+		: m_link_filepath(nullptr)
+	{
+		/* Nothing more to do: Datafile() has already run. */
+	}
+
+	/** Construct with no link file path; all arguments are ignored. */
+	RemoteDatafile(const char*, ulint, ulint)
+		: m_link_filepath(nullptr)
+	{
+		/* Nothing more to do: Datafile() has already run. */
+	}
+
+	~RemoteDatafile() override
+	{
+		shutdown();
+	}
+
+	/** Release the resources held by this object. */
+	void shutdown() override;
+
+	/** Accessor for the link file path.
+	@return m_link_filepath */
+	const char*	link_filepath()	const
+	{
+		return m_link_filepath;
+	}
+
+	/** Try to load the contents of an .isl file into m_filepath.
+	@param name   table name
+	@return filepath()
+	@retval nullptr  if the .isl file does not exist or cannot be read */
+	const char* open_link_file(const fil_space_t::name_type name);
+
+	/** Delete an InnoDB Symbolic Link (ISL) file. */
+	void delete_link_file();
+
+	/******************************************************************
+	Global Static Functions;  Cannot refer to data members.
+	******************************************************************/
+
+	/** Create an InnoDB Symbolic Link (ISL) file.
+	@param name     tablespace name
+	@param filepath full file name
+	@return DB_SUCCESS or error code */
+	static dberr_t create_link_file(fil_space_t::name_type name,
+					const char *filepath);
+
+	/** Delete the InnoDB Symbolic Link (ISL) file of a tablespace.
+	@param name   tablespace name */
+	static void delete_link_file(fil_space_t::name_type name);
+};
+#endif /* fsp0file_h */
diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h
new file mode 100644
index 00000000..26261554
--- /dev/null
+++ b/storage/innobase/include/fsp0fsp.h
@@ -0,0 +1,762 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0fsp.h
+File space management
+
+Created 12/18/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef fsp0fsp_h
+#define fsp0fsp_h
+
+#include "assume_aligned.h"
+#include "fsp0types.h"
+#include "fut0lst.h"
+#include "ut0byte.h"
+
+#ifndef UNIV_INNOCHECKSUM
+#include "mtr0mtr.h"
+#include "page0types.h"
+#include "rem0types.h"
+#else
+# include "mach0data.h"
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** @return the PAGE_SSIZE flags for the current innodb_page_size */
+#define FSP_FLAGS_PAGE_SSIZE()						\
+	((srv_page_size == UNIV_PAGE_SIZE_ORIG) ?			\
+	 0U : (srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1)	\
+	 << FSP_FLAGS_POS_PAGE_SSIZE)
+
+/** @return the PAGE_SSIZE flags for the current innodb_page_size in
+full checksum format */
+#define FSP_FLAGS_FCRC32_PAGE_SSIZE()					\
+	((srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1)		\
+	<< FSP_FLAGS_FCRC32_POS_PAGE_SSIZE)
+
+/* @defgroup Compatibility macros for MariaDB 10.1.0 through 10.1.20;
+see the table in fsp0types.h @{ */
+/** Zero relative shift position of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101	\
+	(FSP_FLAGS_POS_ATOMIC_BLOBS			\
+	 + FSP_FLAGS_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */
+#define FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101	\
+	(FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101 + 1)
+/** Zero relative shift position of the ATOMIC_WRITES field */
+#define FSP_FLAGS_POS_ATOMIC_WRITES_MARIADB101		\
+	(FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101 + 4)
+/** Zero relative shift position of the PAGE_SSIZE field */
+#define FSP_FLAGS_POS_PAGE_SSIZE_MARIADB101		\
+	(FSP_FLAGS_POS_ATOMIC_WRITES_MARIADB101 + 2)
+
+/** Bit mask of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_MASK_PAGE_COMPRESSION_MARIADB101		\
+	(1U << FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101)
+/** Bit mask of the PAGE_COMPRESSION_LEVEL field */
+#define FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL_MARIADB101	\
+	(15U << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101)
+/** Bit mask of the ATOMIC_WRITES field */
+#define FSP_FLAGS_MASK_ATOMIC_WRITES_MARIADB101			\
+	(3U << FSP_FLAGS_POS_ATOMIC_WRITES_MARIADB101)
+/** Bit mask of the PAGE_SSIZE field */
+#define FSP_FLAGS_MASK_PAGE_SSIZE_MARIADB101			\
+	(15U << FSP_FLAGS_POS_PAGE_SSIZE_MARIADB101)
+
+/** Return the value of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags)	\
+		(((flags) & FSP_FLAGS_MASK_PAGE_COMPRESSION_MARIADB101)	\
+		>> FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101)
+/** Return the value of the PAGE_COMPRESSION_LEVEL field */
+#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL_MARIADB101(flags)	\
+		(((flags) & FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL_MARIADB101) \
+		>> FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101)
+/** Return the value of the PAGE_SSIZE field */
+#define FSP_FLAGS_GET_PAGE_SSIZE_MARIADB101(flags)		\
+		(((flags) & FSP_FLAGS_MASK_PAGE_SSIZE_MARIADB101)	\
+		>> FSP_FLAGS_POS_PAGE_SSIZE_MARIADB101)
+
+/* @} */
+
+/* @defgroup Tablespace Header Constants (moved from fsp0fsp.c) @{ */
+
+/** Offset of the space header within a file page */
+#define FSP_HEADER_OFFSET	FIL_PAGE_DATA
+
+/* The data structures in files are defined just as byte strings in C */
+typedef	byte	xdes_t;
+
+/*			SPACE HEADER
+			============
+
+File space header data structure: this data structure is contained in the
+first page of a space. The space for this header is reserved in every extent
+descriptor page, but used only in the first. */
+
+/*-------------------------------------*/
+#define FSP_SPACE_ID		0	/* space id */
+#define FSP_NOT_USED		4	/* this field contained a value up to
+					which we know that the modifications
+					in the database have been flushed to
+					the file space; not used now */
+#define	FSP_SIZE		8	/* Current size of the space in
+					pages */
+#define	FSP_FREE_LIMIT		12	/* Minimum page number for which the
+					free list has not been initialized:
+					the pages >= this limit are, by
+					definition, free; note that in a
+					single-table tablespace where size
+					< 64 pages, this number is 64, i.e.,
+					we have initialized the space
+					about the first extent, but have not
+					physically allocated those pages to the
+					file */
+#define	FSP_SPACE_FLAGS		16	/* fil_space_t::flags, similar to
+					dict_table_t::flags */
+#define	FSP_FRAG_N_USED		20	/* number of used pages in the
+					FSP_FREE_FRAG list */
+#define	FSP_FREE		24	/* list of free extents */
+#define	FSP_FREE_FRAG		(24 + FLST_BASE_NODE_SIZE)
+					/* list of partially free extents not
+					belonging to any segment */
+#define	FSP_FULL_FRAG		(24 + 2 * FLST_BASE_NODE_SIZE)
+					/* list of full extents not belonging
+					to any segment */
+#define FSP_SEG_ID		(24 + 3 * FLST_BASE_NODE_SIZE)
+					/* 8 bytes which give the first unused
+					segment id */
+#define FSP_SEG_INODES_FULL	(32 + 3 * FLST_BASE_NODE_SIZE)
+					/* list of pages containing segment
+					headers, where all the segment inode
+					slots are reserved */
+#define FSP_SEG_INODES_FREE	(32 + 4 * FLST_BASE_NODE_SIZE)
+					/* list of pages containing segment
+					headers, where not all the segment
+					header slots are reserved */
+/*-------------------------------------*/
+/* File space header size */
+#define	FSP_HEADER_SIZE		(32 + 5 * FLST_BASE_NODE_SIZE)
+
+#define	FSP_FREE_ADD		4	/* this many free extents are added
+					to the free list from above
+					FSP_FREE_LIMIT at a time */
+/* @} */
+
+/* @defgroup File Segment Inode Constants (moved from fsp0fsp.c) @{ */
+
+/*			FILE SEGMENT INODE
+			==================
+
+Segment inode which is created for each segment in a tablespace. NOTE: in
+purge we assume that a segment having only one currently used page can be
+freed in a few steps, so that the freeing cannot fill the file buffer with
+buffer-fixed file pages. */
+
+typedef	byte	fseg_inode_t;
+
+#define FSEG_INODE_PAGE_NODE	FSEG_PAGE_DATA
+					/* the list node for linking
+					segment inode pages */
+
+#define FSEG_ARR_OFFSET		(FSEG_PAGE_DATA + FLST_NODE_SIZE)
+/*-------------------------------------*/
+#define	FSEG_ID			0	/* 8 bytes of segment id: if this is 0,
+					it means that the header is unused */
+#define FSEG_NOT_FULL_N_USED	8
+					/* number of used segment pages in
+					the FSEG_NOT_FULL list */
+#define	FSEG_FREE		12
+					/* list of free extents of this
+					segment */
+#define	FSEG_NOT_FULL		(12 + FLST_BASE_NODE_SIZE)
+					/* list of partially free extents */
+#define	FSEG_FULL		(12 + 2 * FLST_BASE_NODE_SIZE)
+					/* list of full extents */
+#define	FSEG_MAGIC_N		(12 + 3 * FLST_BASE_NODE_SIZE)
+					/* magic number used in debugging */
+#define	FSEG_FRAG_ARR		(16 + 3 * FLST_BASE_NODE_SIZE)
+					/* array of individual pages
+					belonging to this segment in fsp
+					fragment extent lists */
+#define FSEG_FRAG_ARR_N_SLOTS	(FSP_EXTENT_SIZE / 2)
+					/* number of slots in the array for
+					the fragment pages */
+#define	FSEG_FRAG_SLOT_SIZE	4	/* a fragment page slot contains its
+					page number within space, FIL_NULL
+					means that the slot is not in use */
+/*-------------------------------------*/
+#define FSEG_INODE_SIZE					\
+	(16 + 3 * FLST_BASE_NODE_SIZE			\
+	 + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE)
+
+static constexpr byte FSEG_MAGIC_N_BYTES[4]={0x05,0xd6,0x69,0xd2};
+
+#define	FSEG_FILLFACTOR		8	/* If the number of unused but reserved
+					pages in a segment is less than
+					reserved pages / FSEG_FILLFACTOR,
+					and there are
+					at least FSEG_FRAG_LIMIT used pages,
+					then we allow a new empty extent to
+					be added to the segment in
+					fseg_alloc_free_page_general().
+					Otherwise, we
+					use unused pages of the segment. */
+
+#define FSEG_FRAG_LIMIT		FSEG_FRAG_ARR_N_SLOTS
+					/* If the segment has >= this many
+					used pages, it may be expanded by
+					allocating extents to the segment;
+					until that only individual fragment
+					pages are allocated from the space */
+
+#define	FSEG_FREE_LIST_LIMIT	40	/* If the reserved size of a segment
+					is at least this many extents, we
+					allow extents to be put to the free
+					list of the segment: at most
+					FSEG_FREE_LIST_MAX_LEN many */
+#define	FSEG_FREE_LIST_MAX_LEN	4
+/* @} */
+
+/* @defgroup Extent Descriptor Constants (moved from fsp0fsp.c) @{ */
+
+/*			EXTENT DESCRIPTOR
+			=================
+
+File extent descriptor data structure: contains bits to tell which pages in
+the extent are free and which contain old tuple versions to clean. */
+
+/*-------------------------------------*/
+#define	XDES_ID			0	/* The identifier of the segment
+					to which this extent belongs */
+#define XDES_FLST_NODE		8	/* The list node data structure
+					for the descriptors */
+#define	XDES_STATE		(FLST_NODE_SIZE + 8)
+					/* contains state information
+					of the extent */
+#define	XDES_BITMAP		(FLST_NODE_SIZE + 12)
+					/* Descriptor bitmap of the pages
+					in the extent */
+/*-------------------------------------*/
+
+#define	XDES_BITS_PER_PAGE	2	/* How many bits are there per page */
+#define	XDES_FREE_BIT		0	/* Index of the bit which tells if
+					the page is free */
+#define	XDES_CLEAN_BIT		1	/* NOTE: currently not used!
+					Index of the bit which tells if
+					there are old versions of tuples
+					on the page */
+/* States of a descriptor */
+#define	XDES_FREE		1	/* extent is in free list of space */
+#define	XDES_FREE_FRAG		2	/* extent is in free fragment list of
+					space */
+#define	XDES_FULL_FRAG		3	/* extent is in full fragment list of
+					space */
+#define	XDES_FSEG		4	/* extent belongs to a segment */
+
+/** File extent data structure size in bytes. */
+#define	XDES_SIZE							\
+	(XDES_BITMAP							\
+	+ UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE))
+
+/** File extent data structure size in bytes for MAX page size. */
+#define	XDES_SIZE_MAX							\
+	(XDES_BITMAP							\
+	+ UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MAX * XDES_BITS_PER_PAGE))
+
+/** File extent data structure size in bytes for MIN page size. */
+#define	XDES_SIZE_MIN							\
+	(XDES_BITMAP							\
+	+ UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MIN * XDES_BITS_PER_PAGE))
+
+/** Offset of the descriptor array on a descriptor page */
+#define	XDES_ARR_OFFSET		(FSP_HEADER_OFFSET + FSP_HEADER_SIZE)
+
+/** Test the XDES_FREE_BIT of a page in an extent descriptor bitmap.
+Each page owns XDES_BITS_PER_PAGE consecutive bits of the bitmap.
+@param[in]	descr	extent descriptor
+@param[in]	offset	page offset within extent
+@return whether the page is free */
+inline bool xdes_is_free(const xdes_t *descr, ulint offset)
+{
+  ut_ad(offset < FSP_EXTENT_SIZE);
+  const ulint bit= XDES_BITS_PER_PAGE * offset + XDES_FREE_BIT;
+  return ut_bit_get_nth(descr[XDES_BITMAP + bit / 8], bit % 8);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/* @} */
+
+/** Fetch one 4-byte field of the tablespace header.
+@param[in]	page	first page of a tablespace
+@param[in]	field	offset of the field relative to FSP_HEADER_OFFSET
+@return the contents of the header field */
+inline uint32_t fsp_header_get_field(const page_t* page, ulint field)
+{
+  const page_t *header= my_assume_aligned<UNIV_ZIP_SIZE_MIN>(page);
+  return mach_read_from_4(header + FSP_HEADER_OFFSET + field);
+}
+
+/** Fetch the FSP_SPACE_FLAGS field of the tablespace header.
+@param[in]	page	first page of a tablespace
+@return the contents of FSP_SPACE_FLAGS */
+inline uint32_t fsp_header_get_flags(const page_t *page)
+{
+  return fsp_header_get_field(page, static_cast<ulint>(FSP_SPACE_FLAGS));
+}
+
+/** Compute where the encryption information starts in page 0.
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@return	byte offset relative to FSP_HEADER_OFFSET */
+inline MY_ATTRIBUTE((pure, warn_unused_result))
+ulint fsp_header_get_encryption_offset(ulint zip_size)
+{
+	/* zip_size==0 selects the current innodb_page_size */
+	const ulint xdes_bytes = zip_size
+		? XDES_SIZE * zip_size : (XDES_SIZE << srv_page_size_shift);
+	return XDES_ARR_OFFSET + xdes_bytes / FSP_EXTENT_SIZE;
+}
+
+/** Check the encryption key from the first page of a tablespace.
+@param[in]	fsp_flags	tablespace flags
+@param[in]	page		first page of a tablespace
+@return true if success */
+bool
+fsp_header_check_encryption_key(
+	ulint			fsp_flags,
+	page_t*			page);
+
+/** Initialize a tablespace header.
+@param[in,out]	space	tablespace
+@param[in]	size	current size in blocks
+@param[in,out]	mtr	mini-transaction
+@return error code */
+dberr_t fsp_header_init(fil_space_t *space, uint32_t size, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Create a new segment.
+@param space                tablespace
+@param byte_offset          byte offset of the created segment header
+@param mtr                  mini-transaction
+@param err                  error code
+@param has_done_reservation whether fsp_reserve_free_extents() was invoked
+@param block                block where segment header is placed,
+                            or NULL to allocate an additional page for that
+@return the block where the segment header is placed, x-latched
+@retval nullptr if could not create segment */
+buf_block_t*
+fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, dberr_t *err,
+            bool has_done_reservation= false, buf_block_t *block= nullptr)
+  MY_ATTRIBUTE((nonnull(1,3,4), warn_unused_result));
+
+/** Calculate the number of pages reserved by a segment,
+and how many pages are currently used.
+@param[in]      block   buffer block containing the file segment header
+@param[in]      header  file segment header
+@param[out]     used    number of pages that are used (not more than reserved)
+@param[in,out]  mtr     mini-transaction
+@return number of reserved pages */
+ulint fseg_n_reserved_pages(const buf_block_t &block,
+                            const fseg_header_t *header, ulint *used,
+                            mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Allocate a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation. All pointer arguments must be non-null.
+@retval NULL if no page could be allocated (see the out parameter err) */
+buf_block_t*
+fseg_alloc_free_page_general(
+/*=========================*/
+	fseg_header_t*	seg_header,/*!< in/out: segment header */
+	uint32_t	hint,	/*!< in: hint of which page would be
+				desirable */
+	byte		direction,/*!< in: if the new page is needed because
+				of an index page split, and records are
+				inserted there in order, into which
+				direction they go alphabetically: FSP_DOWN,
+				FSP_UP, FSP_NO_DIR */
+	bool		has_done_reservation, /*!< in: true if the caller has
+				already done the reservation for the page
+				with fsp_reserve_free_extents(); then there
+				is no need to do the check for this individual
+				page */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	mtr_t*		init_mtr,/*!< in/out: mtr or another mini-transaction
+				in which the page should be initialized. */
+	dberr_t*	err)	/*!< out: error code */
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+/** Reserves free pages from a tablespace. All mini-transactions which may
+use several pages from the tablespace should call this function beforehand
+and reserve enough free extents so that they certainly will be able
+to do their operation, like a B-tree page split, fully. Reservations
+must be released with function fil_space_t::release_free_extents()!
+
+The alloc_type below has the following meaning: FSP_NORMAL means an
+operation which will probably result in more space usage, like an
+insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are
+deleting rows, then this allocation will in the long run result in
+less space usage (after a purge); FSP_CLEANING means allocation done
+in a physical record delete (like in a purge) or other cleaning operation
+which will result in less space usage in the long run. We prefer the latter
+two types of allocation: when space is scarce, FSP_NORMAL allocations
+will not succeed, but the latter two allocations will succeed, if possible.
+The purpose is to avoid a dead end where the database is full but the
+user cannot free any space because these freeing operations temporarily
+reserve some space.
+
+Single-table tablespaces whose size is < FSP_EXTENT_SIZE pages are a special
+case. In this function we would liberally reserve several extents for
+every page split or merge in a B-tree. But we do not want to waste disk space
+if the table only occupies < FSP_EXTENT_SIZE pages. That is why we apply
+different rules in that special case, just ensuring that there are n_pages
+free pages available.
+
+@param[out]     n_reserved      number of extents actually reserved; if
+                                DB_SUCCESS is returned and the tablespace size
+                                is < FSP_EXTENT_SIZE pages, then this can be 0,
+                                otherwise it is n_ext
+@param[in,out]  space           tablespace
+@param[in]      n_ext           number of extents to reserve
+@param[in]      alloc_type      page reservation type (FSP_NORMAL, FSP_UNDO,
+                                FSP_CLEANING, FSP_BLOB, etc.; see above)
+@param[in,out]  mtr             the mini transaction
+@param[in]      n_pages         for small tablespaces (tablespace size is
+                                less than FSP_EXTENT_SIZE), number of free
+                                pages to reserve; defaults to 2
+@return error code (there is no separate error out parameter)
+@retval DB_SUCCESS if we were able to make the reservation */
+dberr_t
+fsp_reserve_free_extents(
+	uint32_t*	n_reserved,
+	fil_space_t*	space,
+	uint32_t	n_ext,
+	fsp_reserve_t	alloc_type,
+	mtr_t*		mtr,
+	uint32_t	n_pages = 2);
+
+/** Free a page in a file segment.
+@param[in,out]	seg_header	file segment header
+@param[in,out]	space		tablespace
+@param[in]	offset		page number of the page to free
+@param[in,out]	mtr		mini-transaction
+@param[in]	have_latch	whether space->x_lock() was already called
+@return error code; the attributes require that it is not ignored */
+dberr_t
+fseg_free_page(
+	fseg_header_t*	seg_header,
+	fil_space_t*	space,
+	uint32_t	offset,
+	mtr_t*		mtr,
+	bool		have_latch = false)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Determine whether a page is allocated.
+@param space   tablespace; must not be null
+@param page    page number
+@return error code, or one of the two non-error status values below
+@retval DB_SUCCESS             if the page is marked as free
+@retval DB_SUCCESS_LOCKED_REC  if the page is marked as allocated */
+dberr_t fseg_page_is_allocated(fil_space_t *space, unsigned page)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Frees part of a segment. This function can be used to free
+a segment by repeatedly calling this function in different
+mini-transactions. Doing the freeing in a single mini-transaction
+might result in too big a mini-transaction.
+@param	header	segment header; NOTE: if the header resides on first
+		page of the frag list of the segment, this pointer
+		becomes obsolete after the last freeing step
+@param	mtr	mini-transaction
+@param	ahi	whether to drop the adaptive hash index (BTR_CUR_HASH_ADAPT only)
+@return whether the freeing was completed */
+bool
+fseg_free_step(
+	fseg_header_t*	header,
+	mtr_t*		mtr
+#ifdef BTR_CUR_HASH_ADAPT
+	,bool		ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+	)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Frees part of a segment. Differs from fseg_free_step() because
+this function leaves the header page unfreed.
+@param	header	segment header which must reside on the first
+		fragment page of the segment
+@param	mtr	mini-transaction
+@param	ahi	whether to drop the adaptive hash index (BTR_CUR_HASH_ADAPT only)
+@return whether the freeing was completed, except for the header page */
+bool
+fseg_free_step_not_header(
+	fseg_header_t*	header,
+	mtr_t*		mtr
+#ifdef BTR_CUR_HASH_ADAPT
+	,bool		ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+	)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Reset the page type (cold path invoked by fil_block_check_type()).
+Data files created before MySQL 5.1.48 may contain garbage in FIL_PAGE_TYPE.
+In MySQL 3.23.53, only undo log pages and index pages were tagged.
+Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE.
+@param[in]	block	block with invalid FIL_PAGE_TYPE
+@param[in]	type	expected page type
+@param[in,out]	mtr	mini-transaction */
+ATTRIBUTE_COLD
+void fil_block_reset_type(const buf_block_t& block, ulint type, mtr_t* mtr);
+
+/** Check the page type of a block and, on a mismatch, reset it.
+Data files created before MySQL 5.1.48 may contain
+garbage in the FIL_PAGE_TYPE field.
+In MySQL 3.23.53, only undo log pages and index pages were tagged.
+Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE.
+@param[in]	block	block whose page frame may carry an invalid
+			FIL_PAGE_TYPE
+@param[in]	type	expected page type
+@param[in,out]	mtr	mini-transaction */
+inline void fil_block_check_type(const buf_block_t &block, ulint type,
+                                 mtr_t *mtr)
+{
+  const bool type_mismatch= fil_page_get_type(block.page.frame) != type;
+  if (UNIV_UNLIKELY(type_mismatch))
+  {
+    fil_block_reset_type(block, type, mtr);
+  }
+}
+
+/** Check whether a page number denotes an extent descriptor page.
+@param[in]	page_id		page id
+@param[in]	physical_size	page size (used as a power-of-2 mask)
+@return whether page_id refers to a descriptor page */
+inline bool fsp_descr_page(const page_id_t page_id, ulint physical_size)
+{
+	return FSP_XDES_OFFSET == (page_id.page_no() & (physical_size - 1));
+}
+
+/** Initialize a file page whose prior contents should be ignored.
+@param[in,out]	block	buffer pool block of the page to initialize */
+void fsp_apply_init_file_page(buf_block_t *block);
+
+/** Initialize a file page: fsp_apply_init_file_page() plus mtr->init().
+@param[in]	space	tablespace; a parameter (and checked) only if UNIV_DEBUG
+@param[in,out]	block	file page
+@param[in,out]	mtr	mini-transaction */
+inline void fsp_init_file_page(
+#ifdef UNIV_DEBUG
+	const fil_space_t* space,
+#endif /* UNIV_DEBUG */
+	buf_block_t* block, mtr_t* mtr)
+{
+	ut_d(space->modify_check(*mtr));
+	ut_ad(space->id == block->page.id().space());
+	fsp_apply_init_file_page(block);
+	mtr->init(block);
+}
+
+#ifndef UNIV_DEBUG /* callers always pass space; drop it in non-debug builds */
+# define fsp_init_file_page(space, block, mtr) fsp_init_file_page(block, mtr)
+#endif /* !UNIV_DEBUG */
+
+#ifdef UNIV_BTR_PRINT
+/*******************************************************************//**
+Writes info of a segment. Only declared if UNIV_BTR_PRINT is defined. */
+void
+fseg_print(
+/*=======*/
+	fseg_header_t*	header, /*!< in: segment header */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
+#endif /* UNIV_BTR_PRINT */
+
+/** Convert FSP_SPACE_FLAGS from the buggy MariaDB 10.1.0..10.1.20 format.
+@param[in]	flags	the contents of FSP_SPACE_FLAGS
+@return	flags in the corrected format (0 or full_crc32 flags: unchanged)
+@retval	UINT32_MAX  if the flags are not in the buggy 10.1 format */
+MY_ATTRIBUTE((warn_unused_result, const))
+inline uint32_t fsp_flags_convert_from_101(uint32_t flags)
+{
+	DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return UINT32_MAX;);
+	if (flags == 0 || fil_space_t::full_crc32(flags)) {
+		return(flags);
+	}
+
+	if (flags >> 18) {
+		/* The most significant FSP_SPACE_FLAGS bit that was ever set
+		by MariaDB 10.1.0 to 10.1.20 was bit 17 (misplaced DATA_DIR flag).
+		The flags must be less than 1<<18 in order to be valid. */
+		return UINT32_MAX;
+	}
+
+	if ((flags & (FSP_FLAGS_MASK_POST_ANTELOPE | FSP_FLAGS_MASK_ATOMIC_BLOBS))
+	    == FSP_FLAGS_MASK_ATOMIC_BLOBS) {
+		/* If the "atomic blobs" flag (indicating
+		ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED)
+		is set, then the "post Antelope" (ROW_FORMAT!=REDUNDANT) flag
+		must also be set. */
+		return UINT32_MAX;
+	}
+
+	/* Bits 6..10 denote compression in MariaDB 10.1.0 to 10.1.20.
+	They must be either 0b00000 or 0b00011 through 0b10011.
+	In correct versions, these bits would be
+	0bd0sss where d is the DATA_DIR flag (garbage bit) and
+	sss is the PAGE_SSIZE (3, 4, 6, or 7).
+
+	NOTE: MariaDB 10.1.0 to 10.1.20 can misinterpret
+	uncompressed data files with innodb_page_size=4k or 64k as
+	compressed innodb_page_size=16k files. Below is an exhaustive
+	state space analysis.
+
+	-0by1zzz: impossible (the bit 4 must be clean; see above)
+	-0b101xx: DATA_DIR, innodb_page_size>4k: invalid (COMPRESSION_LEVEL>9)
+	+0bx0011: innodb_page_size=4k:
+	!!!	Misinterpreted as COMPRESSION_LEVEL=9 or 1, COMPRESSION=1.
+	-0bx0010: impossible, because sss must be 0b011 or 0b1xx
+	-0bx0001: impossible, because sss must be 0b011 or 0b1xx
+	-0b10000: DATA_DIR, innodb_page_size=16k:
+	invalid (COMPRESSION_LEVEL=8 but COMPRESSION=0)
+	+0b00111: no DATA_DIR, innodb_page_size=64k:
+	!!!	Misinterpreted as COMPRESSION_LEVEL=3, COMPRESSION=1.
+	-0b00101: impossible, because sss must be 0 for 16k, not 0b101
+	-0b001x0: no DATA_DIR, innodb_page_size=32k or 8k:
+	invalid (COMPRESSION_LEVEL=3 but COMPRESSION=0)
+	+0b00000: innodb_page_size=16k (looks like COMPRESSION=0)
+	???	Could actually be compressed; see PAGE_SSIZE below */
+	const uint32_t level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL_MARIADB101(
+		flags);
+	if (FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags) != (level != 0)
+	    || level > 9) {
+		/* The compression flags are not in the buggy MariaDB
+		10.1 format. */
+		return UINT32_MAX;
+	}
+	if (!(~flags & FSP_FLAGS_MASK_ATOMIC_WRITES_MARIADB101)) {
+		/* The ATOMIC_WRITES flags cannot be 0b11.
+		(The bits 11..12 should actually never be 0b11,
+		because in MySQL they would be SHARED|TEMPORARY.) */
+		return UINT32_MAX;
+	}
+
+	/* Bits 13..16 are the wrong position for PAGE_SSIZE, and they
+	should contain one of the values 3,4,6,7, that is, be of the form
+	0b0011 or 0b01xx (except 0b0101).
+	In correct versions, these bits should be 0bc0se
+	where c is the MariaDB COMPRESSED flag
+	and e is the MySQL 5.7 ENCRYPTION flag
+	and s is the MySQL 8.0 SDI flag. MariaDB can only support s=0, e=0.
+
+	Compressed innodb_page_size=16k tables with correct FSP_SPACE_FLAGS
+	will be properly rejected by older MariaDB 10.1.x because they
+	would read as PAGE_SSIZE>=8 which is not valid. */
+
+	const uint32_t ssize = FSP_FLAGS_GET_PAGE_SSIZE_MARIADB101(flags);
+	if (ssize == 1 || ssize == 2 || ssize == 5 || ssize & 8) {
+		/* the page_size is not between 4k and 64k;
+		16k should be encoded as 0, not 5 */
+		return UINT32_MAX;
+	}
+	const uint32_t zssize = FSP_FLAGS_GET_ZIP_SSIZE(flags);
+	if (zssize == 0) {
+		/* not ROW_FORMAT=COMPRESSED */
+	} else if (zssize > (ssize ? ssize : 5)) {
+		/* invalid KEY_BLOCK_SIZE (ssize==0 stands for 16k, i.e. 5) */
+		return UINT32_MAX;
+	} else if (~flags & (FSP_FLAGS_MASK_POST_ANTELOPE
+			     | FSP_FLAGS_MASK_ATOMIC_BLOBS)) {
+		/* both these flags should be set for
+		ROW_FORMAT=COMPRESSED */
+		return UINT32_MAX;
+	}
+	/* Keep bits 0..5; put PAGE_SSIZE and PAGE_COMPRESSION where they belong. */
+	flags = ((flags & 0x3f) | ssize << FSP_FLAGS_POS_PAGE_SSIZE
+		 | FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags)
+		 << FSP_FLAGS_POS_PAGE_COMPRESSION);
+	ut_ad(fil_space_t::is_valid_flags(flags, false));
+	return(flags);
+}
+
+/** Check whether the FSP_SPACE_FLAGS of a file agree with the expectation.
+@param[in]	expected	expected flags from dict_tf_to_fsp_flags()
+@param[in]	actual		flags read from FSP_SPACE_FLAGS
+@return whether actual matches, as is or after fsp_flags_convert_from_101() */
+MY_ATTRIBUTE((warn_unused_result))
+inline bool fsp_flags_match(uint32_t expected, uint32_t actual)
+{
+  const uint32_t wanted= uint32_t(expected & ~FSP_FLAGS_MEM_MASK);
+  ut_ad(fil_space_t::is_valid_flags(wanted, false));
+  return wanted == actual || wanted == fsp_flags_convert_from_101(actual);
+}
+
+/** Detect FSP_SPACE_FLAGS bits that denote an incompatible MySQL format.
+@param	flags	the contents of FSP_SPACE_FLAGS
+@return	bits 13 and 14 of flags, shifted down to bits 0 and 1
+@retval	0	if neither bit is set (not an incompatible MySQL format) */
+MY_ATTRIBUTE((warn_unused_result, const))
+inline uint32_t fsp_flags_is_incompatible_mysql(uint32_t flags)
+{
+  /* bit 13: MySQL 5.7 encryption flag; bit 14: MySQL 8.0 SDI flag */
+  const unsigned mysql_flags_shift= 13;
+  const uint32_t mysql_flags_mask= 3;
+  const uint32_t mysql_bits= flags >> mysql_flags_shift;
+  return mysql_bits & mysql_flags_mask;
+}
+
+/** Compute the index of a page's extent descriptor in its descriptor page.
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	offset		page offset
+@return descriptor index */
+inline ulint xdes_calc_descriptor_index(ulint zip_size, ulint offset)
+{
+	const ulint	size = zip_size ? zip_size : srv_page_size;
+	const ulint	in_page = ut_2pow_remainder<ulint>(offset, size);
+	return in_page / FSP_EXTENT_SIZE;
+}
+
+/** Compute the number of the descriptor page that covers a given page.
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	offset		page offset
+@return descriptor page offset */
+inline uint32_t xdes_calc_descriptor_page(ulint zip_size, uint32_t offset)
+{
+	/* The descriptor array that starts at XDES_ARR_OFFSET must fit
+	in one page, for every page size that can be in use. */
+	compile_time_assert(UNIV_PAGE_SIZE_MAX > XDES_ARR_OFFSET
+		+ (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX) * XDES_SIZE_MAX);
+	compile_time_assert(UNIV_PAGE_SIZE_MIN > XDES_ARR_OFFSET
+		+ (UNIV_PAGE_SIZE_MIN / FSP_EXTENT_SIZE_MIN) * XDES_SIZE_MIN);
+	ut_ad(srv_page_size > XDES_ARR_OFFSET
+	      + (srv_page_size / FSP_EXTENT_SIZE) * XDES_SIZE);
+	ut_ad(UNIV_ZIP_SIZE_MIN > XDES_ARR_OFFSET
+	      + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE) * XDES_SIZE);
+	ut_ad(!zip_size
+	      || zip_size > XDES_ARR_OFFSET
+	      + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE);
+
+	const uint32_t	size = uint32_t(zip_size ? zip_size : srv_page_size);
+
+	/* Align the page number to the page size; see ut_2pow_round(). */
+	return ut_2pow_round(offset, size);
+}
+
+#endif /* UNIV_INNOCHECKSUM */
+
+#endif
diff --git a/storage/innobase/include/fsp0space.h b/storage/innobase/include/fsp0space.h
new file mode 100644
index 00000000..a2bb46d3
--- /dev/null
+++ b/storage/innobase/include/fsp0space.h
@@ -0,0 +1,209 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0space.h
+Shared tablespace interface
+
+Created 2013-7-26 by Kevin Lewis
+*******************************************************/
+
+#ifndef fsp0space_h
+#define fsp0space_h
+
+#include "fsp0file.h"
+#include "fsp0fsp.h"
+#include "fsp0types.h"
+
+#include <vector>
+
+/** Data structure that contains the information about shared tablespaces.
+Currently this can be the system tablespace or a temporary table tablespace */
+class Tablespace {
+
+public:
+	typedef std::vector<Datafile, ut_allocator<Datafile> >	files_t;
+
+	/** Data file information - each Datafile can be accessed globally */
+	files_t		m_files;
+	/** Data file iterator */
+	typedef files_t::iterator iterator;
+	/** Data file iterator */
+	typedef files_t::const_iterator const_iterator;
+
+	Tablespace() {}
+
+	virtual ~Tablespace()
+	{
+		shutdown();
+		ut_ad(m_files.empty());
+		ut_ad(m_space_id == UINT32_MAX);
+	}
+
+	// Copying is disabled; attempts are rejected at compile time
+	Tablespace(const Tablespace&) = delete;
+	Tablespace& operator=(const Tablespace&) = delete;
+
+	/** Data file iterator */
+	const_iterator begin() const { return m_files.begin(); }
+	/** Data file iterator */
+	const_iterator end() const { return m_files.end(); }
+	/** Data file iterator */
+	iterator begin() { return m_files.begin(); }
+	/** Data file iterator */
+	iterator end() { return m_files.end(); }
+
+	/** Set the tablespace path; it must not have been set before.
+	@param[in]	path	where tablespace file(s) resides
+	@param[in]	len	length of the file path */
+	void set_path(const char* path, size_t len)
+	{
+		ut_ad(m_path == NULL);
+		m_path = mem_strdupl(path, len);
+		ut_ad(m_path != NULL);
+	}
+
+	/** Set the tablespace path from a NUL-terminated string.
+	@param[in]	path	where tablespace file(s) resides */
+	void set_path(const char* path)
+	{
+		set_path(path, strlen(path));
+	}
+
+	/** Get tablespace path
+	@return tablespace path, or nullptr if set_path() was not called */
+	const char* path()	const
+	{
+		return(m_path);
+	}
+
+	/** Set the space id of the tablespace; it must still be unset.
+	@param[in]	space_id	 tablespace ID to set */
+	void set_space_id(uint32_t space_id)
+	{
+		ut_ad(m_space_id == UINT32_MAX);
+		m_space_id = space_id;
+	}
+
+	/** Get the space id of the tablespace
+	@return m_space_id space id of the tablespace */
+	uint32_t space_id() const { return m_space_id; }
+
+	/** Set the tablespace flags
+	@param[in]	fsp_flags	tablespace flags */
+	void set_flags(uint32_t fsp_flags)
+	{
+		ut_ad(fil_space_t::is_valid_flags(fsp_flags, false));
+		m_flags = fsp_flags;
+	}
+
+	/** Get the tablespace flags
+	@return m_flags tablespace flags */
+	uint32_t flags() const { return m_flags; }
+
+	/** Get the tablespace encryption mode
+	@return m_mode tablespace encryption mode */
+	fil_encryption_t encryption_mode() const { return m_mode; }
+
+	/** Get the tablespace encryption key_id
+	@return m_key_id tablespace encryption key_id */
+	uint32_t key_id() const { return m_key_id; }
+
+	/** Set Ignore Read Only Status for tablespace.
+	@param[in]	read_only_status	read only status indicator */
+	void set_ignore_read_only(bool read_only_status)
+	{
+		m_ignore_read_only = read_only_status;
+	}
+
+	/** Free the memory allocated by the Tablespace object */
+	void shutdown();
+
+	/** @return the sum of the file sizes of each Datafile */
+	uint32_t get_sum_of_sizes() const
+	{
+		uint32_t sum = 0;
+
+		for (const_iterator it = begin(); it != end(); ++it) {
+			sum += it->m_size;
+		}
+
+		return(sum);
+	}
+
+	/** Open or Create the data files if they do not exist.
+	@param[in]	is_temp	whether this is a temporary tablespace
+	@return DB_SUCCESS or error code */
+	dberr_t open_or_create(bool is_temp)
+		MY_ATTRIBUTE((warn_unused_result));
+
+	/** Delete all the data files. */
+	void delete_files();
+
+	/** Check if two tablespaces have common data file names.
+	@param[in]	other_space	Tablespace to check against this.
+	@return true if they have the same data filenames and paths */
+	bool intersection(const Tablespace* other_space);
+
+	/** Use the ADD DATAFILE path to create a Datafile object and add
+	it to the front of m_files. Parse the datafile path into a path
+	and a basename with extension 'ibd'. This datafile_path provided
+	may be an absolute or relative path, but it must end with the
+	extension .ibd and have a basename of at least 1 byte.
+
+	Set tablespace m_path member and add a Datafile with the filename.
+	@param[in]	datafile_path	full path of the tablespace file. */
+	dberr_t add_datafile(
+		const char*	datafile_path);
+
+	/** Return a pointer to the first Datafile; m_files must not be empty.
+	@return pointer to the first Datafile for this Tablespace*/
+	Datafile* first_datafile()
+	{
+		ut_a(!m_files.empty());
+		return(&m_files.front());
+	}
+private:
+	/**
+	@param[in]	filename	Name to lookup in the data files.
+	@return true if the filename exists in the data files */
+	bool find(const char* filename) const;
+
+	/** Note that the data file was found.
+	@param[in]	file	data file object */
+	void file_found(Datafile& file);
+
+	/** Tablespace ID */
+	uint32_t	m_space_id = UINT32_MAX;
+	/** Tablespace flags */
+	uint32_t	m_flags = UINT32_MAX;
+
+	/** Path where tablespace files will reside, excluding a filename */
+	char*		m_path = nullptr;
+
+	/** Encryption mode and key_id; value-initialized (zero) by default */
+	fil_encryption_t m_mode{};
+	uint32_t	m_key_id = 0;
+
+protected:
+	/** Ignore server read only configuration for this tablespace. */
+	bool		m_ignore_read_only = false;
+};
+
+#endif /* fsp0space_h */
diff --git a/storage/innobase/include/fsp0sysspace.h b/storage/innobase/include/fsp0sysspace.h
new file mode 100644
index 00000000..514f3fdb
--- /dev/null
+++ b/storage/innobase/include/fsp0sysspace.h
@@ -0,0 +1,278 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0sysspace.h
+Multi file, shared, system tablespace implementation.
+
+Created 2013-7-26 by Kevin Lewis
+*******************************************************/
+
+#ifndef fsp0sysspace_h
+#define fsp0sysspace_h
+
+#include "fsp0space.h"
+
+/** Growth step of an auto-extended last data file (a public config variable).
+Not in pages: get_autoextend_increment() converts it, apparently from MiB. */
+extern uint sys_tablespace_auto_extend_increment;
+
+/** Multi-file shared tablespace, built on top of Tablespace.
+Currently this can be the system tablespace or a temporary table tablespace */
+class SysTablespace : public Tablespace
+{
+public:
+
+	SysTablespace()
+		:
+		m_auto_extend_last_file(),
+		m_last_file_size_max(),
+		m_created_new_raw(),
+		m_is_tablespace_full(false),
+		m_sanity_checks_done(false)
+	{
+		/* No op */
+	}
+
+	~SysTablespace() override
+	{
+		shutdown();
+	}
+
+	/** Set tablespace full status
+	@param[in]	is_full		true if full */
+	void set_tablespace_full_status(bool is_full)
+	{
+		m_is_tablespace_full = is_full;
+	}
+
+	/** Get tablespace full status
+	@return true if the tablespace is full */
+	bool get_tablespace_full_status()
+	{
+		return(m_is_tablespace_full);
+	}
+
+	/** Set sanity check status
+	@param[in]	status	true if sanity checks are done */
+	void set_sanity_check_status(bool status)
+	{
+		m_sanity_checks_done = status;
+	}
+
+	/** Get sanity check status
+	@return true if sanity checks are done */
+	bool get_sanity_check_status()
+	{
+		return(m_sanity_checks_done);
+	}
+
+	/** Parse the input params and populate member variables.
+	@param	filepath	path to data files
+	@param	supports_raw	true if it supports raw devices
+	@return true on success parse */
+	bool parse_params(const char* filepath, bool supports_raw);
+
+	/** Check the data file specification.
+	@param[out]	create_new_db		true if a new database
+	is to be created
+	@param[in]	min_expected_tablespace_size	expected tablespace
+	size in bytes
+	@return DB_SUCCESS if all OK else error code */
+	dberr_t check_file_spec(
+		bool*	create_new_db,
+		ulint	min_expected_tablespace_size);
+
+	/** Free the memory allocated by parse_params() */
+	void shutdown();
+
+	/** Normalize the file size, convert to extents. */
+	void normalize_size();
+
+	/**
+	@return true if a new raw device was created. */
+	bool created_new_raw() const
+	{
+		return(m_created_new_raw);
+	}
+
+	/** Note: the bool m_auto_extend_last_file is returned as ulint.
+	@return nonzero if the last data file is to be auto-extended */
+	ulint can_auto_extend_last_file() const
+	{
+		return(m_auto_extend_last_file);
+	}
+
+	/** Set the last file size; m_files must not be empty.
+	@param[in]	size	the size to set */
+	void set_last_file_size(uint32_t size)
+	{
+		ut_ad(!m_files.empty());
+		m_files.back().m_size = size;
+	}
+
+	/** Get the size of the last data file; m_files must not be empty.
+	@return the size of the last data file in the array */
+	uint32_t last_file_size() const
+	{
+		ut_ad(!m_files.empty());
+		return(m_files.back().m_size);
+	}
+
+	/** Scale sys_tablespace_auto_extend_increment by 2^(20 - page size shift).
+	@return the autoextend increment in pages. */
+	uint32_t get_autoextend_increment() const
+	{
+		return sys_tablespace_auto_extend_increment
+			<< (20 - srv_page_size_shift);
+	}
+
+	/**
+	@return next increment size */
+	uint32_t get_increment() const;
+
+	/** Open or create the data files
+	@param[in]  is_temp		whether this is a temporary tablespace
+	@param[in]  create_new_db	whether we are creating a new database
+	@param[out] sum_new_sizes	sum of sizes of the new files added
+	@return DB_SUCCESS or error code */
+	dberr_t open_or_create(
+		bool	is_temp,
+		bool	create_new_db,
+		ulint*	sum_new_sizes)
+		MY_ATTRIBUTE((warn_unused_result));
+
+private:
+	/** Check the tablespace header for this tablespace.
+	@return DB_SUCCESS or error code */
+	inline dberr_t read_lsn_and_check_flags();
+
+	/** NOTE(review): 0 (unlimited) is not special-cased here; confirm callers.
+	@return whether the last file size does not exceed m_last_file_size_max */
+	bool is_valid_size() const
+	{
+		return(m_last_file_size_max >= last_file_size());
+	}
+
+	/**
+	@return true if configured to use raw devices */
+	bool has_raw_device();
+
+	/** Note that the data file was not found.
+	@param[in]	file		data file object
+	@param[out]	create_new_db	true if a new instance to be created
+	@return DB_SUCCESS or error code */
+	dberr_t file_not_found(Datafile& file, bool* create_new_db);
+
+	/** Note that the data file was found.
+	@param[in,out]	file	data file object
+	@return true if a new instance to be created */
+	bool file_found(Datafile& file);
+
+	/** Create a data file.
+	@param[in,out]	file	data file object
+	@return DB_SUCCESS or error code */
+	dberr_t create(Datafile& file);
+
+	/** Create a data file.
+	@param[in,out]	file	data file object
+	@return DB_SUCCESS or error code */
+	dberr_t create_file(Datafile& file);
+
+	/** Open a data file.
+	@param[in,out]	file	data file object
+	@return DB_SUCCESS or error code */
+	dberr_t open_file(Datafile& file);
+
+	/** Set the size of the file.
+	@param[in,out]	file	data file object
+	@return DB_SUCCESS or error code */
+	dberr_t set_size(Datafile& file);
+
+	/** Convert a numeric string that optionally ends in G or M, to a
+	number containing megabytes.
+	@param[in]	ptr	string with a quantity in bytes
+	@param[out]	megs	the number in megabytes
+	@return next character in string */
+	static char* parse_units(char* ptr, ulint* megs);
+
+private:
+	enum file_status_t {
+		FILE_STATUS_VOID = 0,		/*!< status not set */
+		FILE_STATUS_RW_PERMISSION_ERROR,/*!< permission error */
+		FILE_STATUS_READ_WRITE_ERROR,	/*!< not readable/writable */
+		FILE_STATUS_NOT_REGULAR_FILE_ERROR /*!< not a regular file */
+	};
+
+	/** Verify the size of the physical file
+	@param[in]	file	data file object
+	@return DB_SUCCESS if OK else error code. */
+	dberr_t check_size(Datafile& file);
+
+	/** Check if a file can be opened in the correct mode.
+	@param[in]	file	data file object
+	@param[out]	reason	exact reason if file_status check failed.
+	@return DB_SUCCESS or error code. */
+	dberr_t check_file_status(
+		const Datafile& 	file,
+		file_status_t& 		reason);
+
+	/* DATA MEMBERS */
+
+	/** if true, then we auto-extend the last data file */
+	bool		m_auto_extend_last_file;
+
+	/** maximum size of the last data file (0=unlimited) */
+	ulint		m_last_file_size_max;
+
+	/** If the following is true we do not allow
+	inserts etc. This protects the user from forgetting
+	the 'newraw' keyword to my.cnf */
+	bool		m_created_new_raw;
+
+	/** Tablespace full status */
+	bool		m_is_tablespace_full;
+
+	/** if false, then sanity checks are still pending */
+	bool		m_sanity_checks_done;
+};
+
+/* GLOBAL OBJECTS */
+
+/** The control info of the system tablespace (defined elsewhere). */
+extern SysTablespace srv_sys_space;
+
+/** The control info of the shared temporary tablespace (defined elsewhere). */
+extern SysTablespace srv_tmp_space;
+
+/** Tell whether a tablespace ID denotes a system tablespace (shared or temp).
+@param[in]	id	tablespace ID to check
+@return whether id is TRX_SYS_SPACE or SRV_TMP_SPACE_ID */
+inline bool is_system_tablespace(uint32_t id)
+{
+  return id == SRV_TMP_SPACE_ID || id == TRX_SYS_SPACE;
+}
+
+/** Tell whether the tablespace ID id denotes a predefined tablespace.
+@return whether id is a system tablespace or an undo tablespace */
+inline bool is_predefined_tablespace(uint32_t id)
+{
+  return is_system_tablespace(id) ? true : srv_is_undo_tablespace(id);
+}
+#endif /* fsp0sysspace_h */
diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h
new file mode 100644
index 00000000..9a23e840
--- /dev/null
+++ b/storage/innobase/include/fsp0types.h
@@ -0,0 +1,404 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0types.h
+File space management types
+
+Created May 26, 2009 Vasil Dimov
+*******************************************************/
+
+#pragma once
+#include "ut0byte.h"
+
+/** All persistent tablespaces have a smaller fil_space_t::id than this. */
+constexpr uint32_t SRV_SPACE_ID_UPPER_BOUND= 0xFFFFFFF0U;
+/** The fil_space_t::id of the innodb_temporary tablespace. */
+constexpr uint32_t SRV_TMP_SPACE_ID= 0xFFFFFFFEU;
+
+/* Possible values of innodb_compression_algorithm */
+#define PAGE_UNCOMPRESSED		0
+#define PAGE_ZLIB_ALGORITHM		1
+#define PAGE_LZ4_ALGORITHM		2
+#define PAGE_LZO_ALGORITHM		3
+#define PAGE_LZMA_ALGORITHM		4
+#define PAGE_BZIP2_ALGORITHM	5
+#define PAGE_SNAPPY_ALGORITHM	6
+#define PAGE_ALGORITHM_LAST		PAGE_SNAPPY_ALGORITHM
+
+extern const char *page_compression_algorithms[];
+
+/** @name Flags for inserting records in order
+If records are inserted in order, there are the following
+flags to tell this (their type is made byte for the compiler
+to warn if direction and hint parameters are switched in
+fseg_alloc_free_page_general) */
+/* @{ */
+#define	FSP_UP		((byte)111)	/*!< alphabetically upwards */
+#define	FSP_DOWN	((byte)112)	/*!< alphabetically downwards */
+#define	FSP_NO_DIR	((byte)113)	/*!< no order */
+/* @} */
+
+/** File space extent size in pages
+page size | file space extent size
+----------+-----------------------
+   4 KiB  | 256 pages = 1 MiB
+   8 KiB  | 128 pages = 1 MiB
+  16 KiB  |  64 pages = 1 MiB
+  32 KiB  |  64 pages = 2 MiB
+  64 KiB  |  64 pages = 4 MiB
+*/
+#define FSP_EXTENT_SIZE         (srv_page_size_shift < 14 ?	\
+				 (1048576U >> srv_page_size_shift) : 64U)
+
+/** File space extent size (four megabyte) in pages for MAX page size */
+#define	FSP_EXTENT_SIZE_MAX	(4194304 / UNIV_PAGE_SIZE_MAX)
+
+/** File space extent size (one megabyte) in pages for MIN page size */
+#define	FSP_EXTENT_SIZE_MIN	(1048576 / UNIV_PAGE_SIZE_MIN)
+
+/** On a page of any file segment, data may be put starting from this
+offset */
+#define FSEG_PAGE_DATA		FIL_PAGE_DATA
+
+/** @name File segment header
+The file segment header points to the inode describing the file segment. */
+/* @{ */
+/** Data type for file segment header */
+typedef	byte	fseg_header_t;
+
+#define FSEG_HDR_SPACE		0	/*!< space id of the inode */
+#define FSEG_HDR_PAGE_NO	4	/*!< page number of the inode */
+#define FSEG_HDR_OFFSET		8	/*!< byte offset of the inode */
+
+#define FSEG_HEADER_SIZE	10	/*!< Length of the file segment
+					header, in bytes */
+/* @} */
+
+#ifndef UNIV_INNOCHECKSUM
+#ifdef UNIV_DEBUG
+
+struct mtr_t;
+
+/** A wrapper class to print the file segment header information. */
+class fseg_header
+{
+public:
+	/** Constructor of fseg_header. Only stores the two pointers.
+	@param[in]	header	the underlying file segment header object
+	@param[in]	mtr	the mini-transaction.  No redo logs are
+				generated, only latches are checked within
+				mini-transaction */
+	fseg_header(
+		const fseg_header_t*	header,
+		mtr_t*			mtr)
+		:
+		m_header(header),
+		m_mtr(mtr)
+	{}
+
+	/** Print the file segment header to the given output stream.
+	@param[in,out]	out	the output stream into which the object
+				is printed.
+	@return the output stream into which the object was printed. */
+	std::ostream&
+	to_stream(std::ostream&	out) const;
+private:
+	/** The underlying file segment header */
+	const fseg_header_t*	m_header;
+
+	/** The mini transaction, which is used mainly to check whether
+	appropriate latches have been taken by the calling thread. */
+	mtr_t*			m_mtr;
+};
+
+/** Stream insertion operator that prints a file segment header wrapper.
+@param[in,out]	out	the output stream into which object will be printed
+@param[in]	header	the file segment header to be printed
+@return the stream returned by header.to_stream(out) */
+inline
+std::ostream&
+operator<<(
+	std::ostream&		out,
+	const fseg_header&	header)
+{
+	return header.to_stream(out);
+}
+#endif /* UNIV_DEBUG */
+
+/** Flags for fsp_reserve_free_extents */
+enum fsp_reserve_t {
+	FSP_NORMAL,	/*!< reservation during normal B-tree operations */
+	FSP_UNDO,	/*!< reservation done for undo logging */
+	FSP_CLEANING,	/*!< reservation done during purge operations */
+	FSP_BLOB	/*!< reservation being done for BLOB insertion */
+};
+
+/* Number of pages described in a single descriptor page: currently each page
+description takes less than 1 byte; a descriptor page is repeated every
+this many file pages */
+/* #define XDES_DESCRIBED_PER_PAGE		srv_page_size */
+/* This has been replaced with either srv_page_size or page_zip->size. */
+
+/** @name The space low address page map
+The pages at FSP_XDES_OFFSET and FSP_IBUF_BITMAP_OFFSET are repeated
+every XDES_DESCRIBED_PER_PAGE pages in every tablespace. */
+/* @{ */
+/*--------------------------------------*/
+#define FSP_XDES_OFFSET			0U	/*!< extent descriptor */
+#define FSP_IBUF_BITMAP_OFFSET		1U	/*!< insert buffer bitmap */
+				/* The ibuf bitmap pages are the ones whose
+				page number is the number above plus a
+				multiple of XDES_DESCRIBED_PER_PAGE */
+
+#define FSP_FIRST_INODE_PAGE_NO		2U	/*!< in every tablespace */
+				/* The following pages exist
+				in the system tablespace (space 0). */
+#define FSP_IBUF_HEADER_PAGE_NO		3U	/*!< insert buffer
+						header page, in
+						tablespace 0 */
+#define FSP_IBUF_TREE_ROOT_PAGE_NO	4U	/*!< insert buffer
+						B-tree root page in
+						tablespace 0 */
+				/* The ibuf tree root page number in
+				tablespace 0; its fseg inode is on the page
+				number FSP_FIRST_INODE_PAGE_NO */
+#define FSP_TRX_SYS_PAGE_NO		5U	/*!< transaction
+						system header, in
+						tablespace 0 */
+#define	FSP_FIRST_RSEG_PAGE_NO		6U	/*!< first rollback segment
+						page, in tablespace 0 */
+#define FSP_DICT_HDR_PAGE_NO		7U	/*!< data dictionary header
+						page, in tablespace 0 */
+/*--------------------------------------*/
+/* @} */
+
+/** Tell whether a tablespace ID is that of the system temporary tablespace.
+@param[in]      space_id        tablespace ID to check
+@return whether space_id equals SRV_TMP_SPACE_ID */
+inline
+bool
+fsp_is_system_temporary(ulint	space_id)
+{
+	return space_id == SRV_TMP_SPACE_ID;
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/* @defgroup fsp_flags InnoDB Tablespace Flag Constants @{ */
+
+/** Width of the POST_ANTELOPE flag */
+#define FSP_FLAGS_WIDTH_POST_ANTELOPE	1
+/** Number of flag bits used to indicate the tablespace zip page size */
+#define FSP_FLAGS_WIDTH_ZIP_SSIZE	4
+/** Width of the ATOMIC_BLOBS flag.  The ability to break up a long
+column into an in-record prefix and an externally stored part is available
+to ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. */
+#define FSP_FLAGS_WIDTH_ATOMIC_BLOBS	1
+/** Number of flag bits used to indicate the tablespace page size */
+#define FSP_FLAGS_WIDTH_PAGE_SSIZE	4
+/** Number of reserved bits */
+#define FSP_FLAGS_WIDTH_RESERVED 6
+/** Number of flag bits used to indicate the page compression */
+#define FSP_FLAGS_WIDTH_PAGE_COMPRESSION 1
+
+/** Width of all the currently known persistent tablespace flags */
+#define FSP_FLAGS_WIDTH		(FSP_FLAGS_WIDTH_POST_ANTELOPE	\
+				+ FSP_FLAGS_WIDTH_ZIP_SSIZE	\
+				+ FSP_FLAGS_WIDTH_ATOMIC_BLOBS	\
+				+ FSP_FLAGS_WIDTH_PAGE_SSIZE	\
+				+ FSP_FLAGS_WIDTH_RESERVED	\
+				+ FSP_FLAGS_WIDTH_PAGE_COMPRESSION)
+
+/** A mask of all the known/used bits in FSP_SPACE_FLAGS */
+#define FSP_FLAGS_MASK		(~(~0U << FSP_FLAGS_WIDTH))
+
+/** Number of flag bits used to indicate the tablespace page size */
+#define FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE	4
+
+/** Marker to indicate whether tablespace is in full checksum format. */
+#define FSP_FLAGS_FCRC32_WIDTH_MARKER		1
+
+/** Stores the compressed algo for full checksum format. */
+#define FSP_FLAGS_FCRC32_WIDTH_COMPRESSED_ALGO	3
+
+/* FSP_SPACE_FLAGS position and name in MySQL 5.6/MariaDB 10.0 or older
+and MariaDB 10.1.20 or older MariaDB 10.1 and in MariaDB 10.1.21
+or newer.
+MySQL 5.6		MariaDB 10.1.x		MariaDB 10.1.21
+====================================================================
+Below flags in same offset
+====================================================================
+0: POST_ANTELOPE	0:POST_ANTELOPE		0: POST_ANTELOPE
+1..4: ZIP_SSIZE(0..5)	1..4:ZIP_SSIZE(0..5)	1..4: ZIP_SSIZE(0..5)
+(NOTE: bit 4 is always 0)
+5: ATOMIC_BLOBS    	5:ATOMIC_BLOBS		5: ATOMIC_BLOBS
+=====================================================================
+Below note the order difference:
+=====================================================================
+6..9: PAGE_SSIZE(3..7)	6: COMPRESSION		6..9: PAGE_SSIZE(3..7)
+10: DATA_DIR		7..10: COMP_LEVEL(0..9)	10: RESERVED (5.6 DATA_DIR)
+=====================================================================
+The flags below were in incorrect position in MariaDB 10.1,
+or have been introduced in MySQL 5.7 or 8.0:
+=====================================================================
+11: UNUSED		11..12:ATOMIC_WRITES	11: RESERVED (5.7 SHARED)
+						12: RESERVED (5.7 TEMPORARY)
+			13..15:PAGE_SSIZE(3..7)	13: RESERVED (5.7 ENCRYPTION)
+						14: RESERVED (8.0 SDI)
+						15: RESERVED
+			16: PAGE_SSIZE_msb(0)	16: COMPRESSION
+			17: DATA_DIR		17: UNUSED
+			18: UNUSED
+=====================================================================
+The flags below only exist in fil_space_t::flags, not in FSP_SPACE_FLAGS:
+=====================================================================
+						27: DATA_DIR
+						28..31: COMPRESSION_LEVEL
+*/
+
+/** A mask of the memory-only flags in fil_space_t::flags */
+#define FSP_FLAGS_MEM_MASK		(~0U << FSP_FLAGS_MEM_DATA_DIR)
+
+/** Zero relative shift position of the DATA_DIR flag */
+#define FSP_FLAGS_MEM_DATA_DIR		27
+/** Zero relative shift position of the COMPRESSION_LEVEL field */
+#define FSP_FLAGS_MEM_COMPRESSION_LEVEL	28
+
+/** Zero relative shift position of the POST_ANTELOPE field */
+#define FSP_FLAGS_POS_POST_ANTELOPE	0
+/** Zero relative shift position of the ZIP_SSIZE field */
+#define FSP_FLAGS_POS_ZIP_SSIZE		(FSP_FLAGS_POS_POST_ANTELOPE	\
+					+ FSP_FLAGS_WIDTH_POST_ANTELOPE)
+/** Zero relative shift position of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_POS_ATOMIC_BLOBS	(FSP_FLAGS_POS_ZIP_SSIZE	\
+					+ FSP_FLAGS_WIDTH_ZIP_SSIZE)
+/** Zero relative shift position of the start of the PAGE_SSIZE bits */
+#define FSP_FLAGS_POS_PAGE_SSIZE	(FSP_FLAGS_POS_ATOMIC_BLOBS	\
+                                        + FSP_FLAGS_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the start of the RESERVED bits
+these are only used in MySQL 5.7 and used for compatibility. */
+#define FSP_FLAGS_POS_RESERVED		(FSP_FLAGS_POS_PAGE_SSIZE	\
+					+ FSP_FLAGS_WIDTH_PAGE_SSIZE)
+/** Zero relative shift position of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_POS_PAGE_COMPRESSION	(FSP_FLAGS_POS_RESERVED \
+					+ FSP_FLAGS_WIDTH_RESERVED)
+
+/** Zero relative shift position of the PAGE_SIZE field
+in full crc32 format */
+#define FSP_FLAGS_FCRC32_POS_PAGE_SSIZE	0
+
+/** Zero relative shift position of the MARKER field in full crc32 format. */
+#define FSP_FLAGS_FCRC32_POS_MARKER	(FSP_FLAGS_FCRC32_POS_PAGE_SSIZE \
+					 + FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE)
+
+/** Zero relative shift position of the compressed algorithm stored
+in full crc32 format. */
+#define FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO	(FSP_FLAGS_FCRC32_POS_MARKER \
+						 + FSP_FLAGS_FCRC32_WIDTH_MARKER)
+
+/** Bit mask of the POST_ANTELOPE field */
+#define FSP_FLAGS_MASK_POST_ANTELOPE				\
+		((~(~0U << FSP_FLAGS_WIDTH_POST_ANTELOPE))	\
+		<< FSP_FLAGS_POS_POST_ANTELOPE)
+/** Bit mask of the ZIP_SSIZE field */
+#define FSP_FLAGS_MASK_ZIP_SSIZE				\
+		((~(~0U << FSP_FLAGS_WIDTH_ZIP_SSIZE))		\
+		<< FSP_FLAGS_POS_ZIP_SSIZE)
+/** Bit mask of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_MASK_ATOMIC_BLOBS				\
+		((~(~0U << FSP_FLAGS_WIDTH_ATOMIC_BLOBS))	\
+		<< FSP_FLAGS_POS_ATOMIC_BLOBS)
+/** Bit mask of the PAGE_SSIZE field */
+#define FSP_FLAGS_MASK_PAGE_SSIZE				\
+		((~(~0U << FSP_FLAGS_WIDTH_PAGE_SSIZE))		\
+		<< FSP_FLAGS_POS_PAGE_SSIZE)
+/** Bit mask of the RESERVED field */
+#define FSP_FLAGS_MASK_RESERVED					\
+		((~(~0U << FSP_FLAGS_WIDTH_RESERVED))		\
+		<< FSP_FLAGS_POS_RESERVED)
+/** Bit mask of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_MASK_PAGE_COMPRESSION				\
+		((~(~0U << FSP_FLAGS_WIDTH_PAGE_COMPRESSION))	\
+		<< FSP_FLAGS_POS_PAGE_COMPRESSION)
+
+/** Bit mask of the in-memory COMPRESSION_LEVEL field */
+#define FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL			\
+		(15U << FSP_FLAGS_MEM_COMPRESSION_LEVEL)
+
+/** Bit mask of the PAGE_SIZE field in full crc32 format */
+#define FSP_FLAGS_FCRC32_MASK_PAGE_SSIZE			\
+		((~(~0U << FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE))	\
+		<< FSP_FLAGS_FCRC32_POS_PAGE_SSIZE)
+
+/** Bit mask of the MARKER field in full crc32 format */
+#define FSP_FLAGS_FCRC32_MASK_MARKER				\
+		((~(~0U << FSP_FLAGS_FCRC32_WIDTH_MARKER))	\
+		<< FSP_FLAGS_FCRC32_POS_MARKER)
+
+/** Bit mask of the COMPRESSED ALGO field in full crc32 format */
+#define FSP_FLAGS_FCRC32_MASK_COMPRESSED_ALGO			\
+		((~(~0U << FSP_FLAGS_FCRC32_WIDTH_COMPRESSED_ALGO))	\
+		<< FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO)
+
+/** @return the POST_ANTELOPE field; flags may be any integer expression */
+#define FSP_FLAGS_GET_POST_ANTELOPE(flags)			\
+		(((flags) & FSP_FLAGS_MASK_POST_ANTELOPE)	\
+		>> FSP_FLAGS_POS_POST_ANTELOPE)
+/** @return the value of the ZIP_SSIZE field */
+#define FSP_FLAGS_GET_ZIP_SSIZE(flags)				\
+		(((flags) & FSP_FLAGS_MASK_ZIP_SSIZE)		\
+		>> FSP_FLAGS_POS_ZIP_SSIZE)
+/** @return the value of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_HAS_ATOMIC_BLOBS(flags)			\
+		(((flags) & FSP_FLAGS_MASK_ATOMIC_BLOBS)	\
+		>> FSP_FLAGS_POS_ATOMIC_BLOBS)
+/** @return the value of the PAGE_SSIZE field */
+#define FSP_FLAGS_GET_PAGE_SSIZE(flags)				\
+		(((flags) & FSP_FLAGS_MASK_PAGE_SSIZE)		\
+		>> FSP_FLAGS_POS_PAGE_SSIZE)
+/** @return the RESERVED flags */
+#define FSP_FLAGS_GET_RESERVED(flags)				\
+		(((flags) & FSP_FLAGS_MASK_RESERVED)		\
+		>> FSP_FLAGS_POS_RESERVED)
+/** @return the PAGE_COMPRESSION flag */
+#define FSP_FLAGS_HAS_PAGE_COMPRESSION(flags)			\
+		(((flags) & FSP_FLAGS_MASK_PAGE_COMPRESSION)	\
+		>> FSP_FLAGS_POS_PAGE_COMPRESSION)
+/** @return the PAGE_SSIZE flags in full crc32 format */
+#define FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags)			\
+		(((flags) & FSP_FLAGS_FCRC32_MASK_PAGE_SSIZE)	\
+		>> FSP_FLAGS_FCRC32_POS_PAGE_SSIZE)
+/** @return the COMPRESSED_ALGO flags in full crc32 format */
+#define FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags)			\
+		(((flags) & FSP_FLAGS_FCRC32_MASK_COMPRESSED_ALGO)	\
+		>> FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO)
+
+/** @return the DATA_DIR bit in place (nonzero if set; not shifted to 0/1) */
+#define FSP_FLAGS_HAS_DATA_DIR(flags)				\
+	((flags) & (1U << FSP_FLAGS_MEM_DATA_DIR))
+/** @return the COMPRESSION_LEVEL field */
+#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags)		\
+	(((flags) & FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL)	\
+	 >> FSP_FLAGS_MEM_COMPRESSION_LEVEL)
+
+/* @} */
+
+struct fil_node_t;
+struct fil_space_t;
+class buf_page_t;
diff --git a/storage/innobase/include/fts0ast.h b/storage/innobase/include/fts0ast.h
new file mode 100644
index 00000000..15bf30bc
--- /dev/null
+++ b/storage/innobase/include/fts0ast.h
@@ -0,0 +1,340 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0ast.h
+The FTS query parser (AST) abstract syntax tree routines
+
+Created 2007/03/16/03 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FST0AST_H
+#define INNOBASE_FST0AST_H
+
+#include "mem0mem.h"
+
+/* The type of AST Node */
+enum fts_ast_type_t {
+	FTS_AST_OPER,				/*!< Operator */
+	FTS_AST_NUMB,				/*!< Number */
+	FTS_AST_TERM,				/*!< Term (or word) */
+	FTS_AST_TEXT,				/*!< Text string */
+	FTS_AST_PARSER_PHRASE_LIST,		/*!< Phrase for plugin parser
+						The difference from text type
+						is that we tokenize text into
+						term list */
+	FTS_AST_LIST,				/*!< Expression list */
+	FTS_AST_SUBEXP_LIST			/*!< Sub-Expression list */
+};
+
+/* The FTS query operators that we support */
+enum fts_ast_oper_t {
+	FTS_NONE,				/*!< No operator */
+
+	FTS_IGNORE,				/*!< Ignore rows that contain
+						this word */
+
+	FTS_EXIST,				/*!< Include rows that contain
+						this word */
+
+	FTS_NEGATE,				/*!< Include rows that contain
+						this word but rank them
+						lower */
+
+	FTS_INCR_RATING,			/*!< Increase the rank for this
+						word */
+
+	FTS_DECR_RATING,			/*!< Decrease the rank for this
+						word */
+
+	FTS_DISTANCE,				/*!< Proximity distance */
+	FTS_IGNORE_SKIP,			/*!< Transient node operator
+						signifies that this is a
+						FTS_IGNORE node, and ignored in
+						the first pass of
+						fts_ast_visit() */
+	FTS_EXIST_SKIP				/*!< Transient node operator
+						signifies that this is a
+						FTS_EXIST node, and ignored in
+						the first pass of
+						fts_ast_visit() */
+};
+
+/* Data types used by the FTS parser */
+struct fts_lexer_t;
+struct fts_ast_node_t;
+struct fts_ast_state_t;
+struct fts_ast_string_t;
+
+typedef dberr_t (*fts_ast_callback)(fts_ast_oper_t, fts_ast_node_t*, void*);
+
+/********************************************************************
+Parse the string using the lexer setup within state.*/
+int
+fts_parse(
+/*======*/
+						/* out: 0 on OK, 1 on error */
+	fts_ast_state_t* state);		/*!< in: ast state instance.*/
+
+/********************************************************************
+Create an AST operator node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_oper(
+/*=====================*/
+	void*		arg,			/*!< in: ast state */
+	fts_ast_oper_t	oper);			/*!< in: ast operator */
+/********************************************************************
+Create an AST term node, makes a copy of ptr */
+extern
+fts_ast_node_t*
+fts_ast_create_node_term(
+/*=====================*/
+	void*			arg,		/*!< in: ast state */
+	const fts_ast_string_t*	ptr);		/*!< in: term string */
+/********************************************************************
+Create an AST text node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_text(
+/*=====================*/
+	void*			arg,		/*!< in: ast state */
+	const fts_ast_string_t*	ptr);		/*!< in: text string */
+/********************************************************************
+Create an AST expr list node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_list(
+/*=====================*/
+	void*		arg,			/*!< in: ast state */
+	fts_ast_node_t*	expr);			/*!< in: ast expr */
+/********************************************************************
+Create a sub-expression list node. This function takes ownership of
+expr and is responsible for deleting it. */
+extern
+fts_ast_node_t*
+fts_ast_create_node_subexp_list(
+/*============================*/
+						/* out: new node */
+	void*		arg,			/*!< in: ast state instance */
+	fts_ast_node_t*	expr);			/*!< in: ast expr instance */
+/********************************************************************
+Set the wildcard attribute of a term.*/
+extern
+void
+fts_ast_term_set_wildcard(
+/*======================*/
+	fts_ast_node_t*	node);			/*!< in: term to change */
+/********************************************************************
+Set the proximity attribute of a text node. */
+void
+fts_ast_text_set_distance(
+/*======================*/
+	fts_ast_node_t*	node,			/*!< in/out: text node */
+	ulint		distance);		/*!< in: the text proximity
+						distance */
+/********************************************************************//**
+Free a fts_ast_node_t instance.
+@return next node to free */
+fts_ast_node_t*
+fts_ast_free_node(
+/*==============*/
+	fts_ast_node_t*	node);			/*!< in: node to free */
+/********************************************************************
+Add a sub-expression to an AST*/
+extern
+fts_ast_node_t*
+fts_ast_add_node(
+/*=============*/
+	fts_ast_node_t*	list,			/*!< in: list node instance */
+	fts_ast_node_t*	node);			/*!< in: (sub) expr to add */
+/********************************************************************
+Print the AST node recursively.*/
+extern
+void
+fts_ast_node_print(
+/*===============*/
+	fts_ast_node_t*	node);			/*!< in: ast node to print */
+/********************************************************************
+Free node and expr allocations.*/
+extern
+void
+fts_ast_state_free(
+/*===============*/
+	fts_ast_state_t*state);			/*!< in: state instance
+						to free */
+/** Check only union operation involved in the node
+@param[in]	node	ast node to check
+@return true if the node contains only union else false. */
+bool
+fts_ast_node_check_union(
+	fts_ast_node_t*	node);
+
+/******************************************************************//**
+Traverse the AST - in-order traversal.
+@return DB_SUCCESS if all went well */
+dberr_t
+fts_ast_visit(
+/*==========*/
+	fts_ast_oper_t		oper,		/*!< in: FTS operator */
+	fts_ast_node_t*		node,		/*!< in: instance to traverse*/
+	fts_ast_callback	visitor,	/*!< in: callback */
+	void*			arg,		/*!< in: callback arg */
+	bool*			has_ignore)	/*!< out: whether we encountered
+						an operator and skipped
+						processing it; currently only
+						FTS_IGNORE is skipped */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************
+Create a lex instance.*/
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+	ibool		boolean_mode,		/*!< in: query type */
+	const byte*	query,			/*!< in: query string */
+	ulint		query_len)		/*!< in: query string len */
+	MY_ATTRIBUTE((nonnull, malloc, warn_unused_result));
+/********************************************************************
+Free an fts_lexer_t instance.*/
+void
+fts_lexer_free(
+/*===========*/
+	fts_lexer_t*	fts_lexer)		/*!< in: lexer instance to
+						free */
+	MY_ATTRIBUTE((nonnull));
+
+/**
+Create an ast string object, with NUL-terminator, so the string
+has one more byte than len
+@param[in] str		pointer to string
+@param[in] len		length of the string
+@return ast string with NUL-terminator */
+fts_ast_string_t*
+fts_ast_string_create(
+	const byte*	str,
+	ulint		len);
+
+/**
+Free an ast string instance
+@param[in,out] ast_str		string to free */
+void
+fts_ast_string_free(
+	fts_ast_string_t*	ast_str);
+
+/**
+Translate ast string of type FTS_AST_NUMB to unsigned long by strtoul
+@param[in] ast_str	string to translate
+@param[in] base		the base
+@return translated number */
+ulint
+fts_ast_string_to_ul(
+	const fts_ast_string_t*	ast_str,
+	int			base);
+
+/* String of length len.
+We always store the string of length len with a terminating '\0',
+regardless of whether there is any 0x00 in the string itself */
+struct fts_ast_string_t {
+	/** Pointer to string. */
+	byte*		str;
+
+	/** Length of the string. */
+	ulint		len;
+};
+
+/* Query term type */
+struct fts_ast_term_t {
+	fts_ast_string_t*	ptr;		/*!< Pointer to term string.*/
+	ibool			wildcard;	/*!< TRUE if wild card set.*/
+};
+
+/* Query text type */
+struct fts_ast_text_t {
+	fts_ast_string_t*	ptr;		/*!< Pointer to text string.*/
+	ulint			distance;	/*!< > 0 if proximity distance
+						set */
+};
+
+/* The list of nodes in an expr list */
+struct fts_ast_list_t {
+	fts_ast_node_t*	head;			/*!< Children list head */
+	fts_ast_node_t*	tail;			/*!< Children list tail */
+};
+
+/* FTS AST node to store the term, text, operator and sub-expressions.*/
+struct fts_ast_node_t {
+	fts_ast_type_t	type;			/*!< The type of node */
+	fts_ast_text_t	text;			/*!< Text node */
+	fts_ast_term_t	term;			/*!< Term node */
+	fts_ast_oper_t	oper;			/*!< Operator value */
+	fts_ast_list_t	list;			/*!< Expression list */
+	fts_ast_node_t*	next;			/*!< Link for expr list */
+	fts_ast_node_t*	next_alloc;		/*!< For tracking allocations */
+	bool		visited;		/*!< whether this node is
+						already processed */
+	/** current transaction */
+	const trx_t*	trx;
+	/* Used by plugin parser */
+	fts_ast_node_t* up_node;		/*!< Direct up node */
+	bool		go_up;			/*!< Flag if go one level up */
+};
+
+/* To track state during parsing */
+struct fts_ast_state_t {
+	mem_heap_t*	heap;			/*!< Heap to use for alloc */
+	fts_ast_node_t*	root;			/*!< If all goes OK, then this
+						will point to the root.*/
+
+	fts_ast_list_t	list;			/*!< List of nodes allocated */
+
+	fts_lexer_t*	lexer;			/*!< Lexer callback + arg */
+	CHARSET_INFO*	charset;		/*!< charset used for
+						tokenization */
+	/* Used by plugin parser */
+	fts_ast_node_t*	cur_node;		/*!< Current node into which
+						 we add new node */
+	int		depth;			/*!< Depth of parsing state */
+};
+
+/******************************************************************//**
+Create an AST term node, makes a copy of ptr for plugin parser
+@return node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_term_for_parser(
+/*================================*/
+	void*		arg,			/*!< in: ast state */
+	const char*	ptr,			/*!< in: term string */
+	const ulint	len);			/*!< in: term string length */
+
+/******************************************************************//**
+Create an AST phrase list node for plugin parser
+@return node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_phrase_list(
+/*============================*/
+	void*		arg);			/*!< in: ast state */
+
+#ifdef UNIV_DEBUG
+const char*
+fts_ast_node_type_get(fts_ast_type_t	type);
+#endif /* UNIV_DEBUG */
+
+#endif /* INNOBASE_FST0AST_H */
diff --git a/storage/innobase/include/fts0blex.h b/storage/innobase/include/fts0blex.h
new file mode 100644
index 00000000..b16e7f2c
--- /dev/null
+++ b/storage/innobase/include/fts0blex.h
@@ -0,0 +1,702 @@
+#ifndef fts0bHEADER_H
+#define fts0bHEADER_H 1
+#define fts0bIN_HEADER 1
+
+#line 6 "../include/fts0blex.h"
+
+#line 8 "../include/fts0blex.h"
+
+#define  YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 6
+#define YY_FLEX_SUBMINOR_VERSION 4
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+#ifdef yy_create_buffer
+#define fts0b_create_buffer_ALREADY_DEFINED
+#else
+#define yy_create_buffer fts0b_create_buffer
+#endif
+
+#ifdef yy_delete_buffer
+#define fts0b_delete_buffer_ALREADY_DEFINED
+#else
+#define yy_delete_buffer fts0b_delete_buffer
+#endif
+
+#ifdef yy_scan_buffer
+#define fts0b_scan_buffer_ALREADY_DEFINED
+#else
+#define yy_scan_buffer fts0b_scan_buffer
+#endif
+
+#ifdef yy_scan_string
+#define fts0b_scan_string_ALREADY_DEFINED
+#else
+#define yy_scan_string fts0b_scan_string
+#endif
+
+#ifdef yy_scan_bytes
+#define fts0b_scan_bytes_ALREADY_DEFINED
+#else
+#define yy_scan_bytes fts0b_scan_bytes
+#endif
+
+#ifdef yy_init_buffer
+#define fts0b_init_buffer_ALREADY_DEFINED
+#else
+#define yy_init_buffer fts0b_init_buffer
+#endif
+
+#ifdef yy_flush_buffer
+#define fts0b_flush_buffer_ALREADY_DEFINED
+#else
+#define yy_flush_buffer fts0b_flush_buffer
+#endif
+
+#ifdef yy_load_buffer_state
+#define fts0b_load_buffer_state_ALREADY_DEFINED
+#else
+#define yy_load_buffer_state fts0b_load_buffer_state
+#endif
+
+#ifdef yy_switch_to_buffer
+#define fts0b_switch_to_buffer_ALREADY_DEFINED
+#else
+#define yy_switch_to_buffer fts0b_switch_to_buffer
+#endif
+
+#ifdef yypush_buffer_state
+#define fts0bpush_buffer_state_ALREADY_DEFINED
+#else
+#define yypush_buffer_state fts0bpush_buffer_state
+#endif
+
+#ifdef yypop_buffer_state
+#define fts0bpop_buffer_state_ALREADY_DEFINED
+#else
+#define yypop_buffer_state fts0bpop_buffer_state
+#endif
+
+#ifdef yyensure_buffer_stack
+#define fts0bensure_buffer_stack_ALREADY_DEFINED
+#else
+#define yyensure_buffer_stack fts0bensure_buffer_stack
+#endif
+
+#ifdef yylex
+#define fts0blex_ALREADY_DEFINED
+#else
+#define yylex fts0blex
+#endif
+
+#ifdef yyrestart
+#define fts0brestart_ALREADY_DEFINED
+#else
+#define yyrestart fts0brestart
+#endif
+
+#ifdef yylex_init
+#define fts0blex_init_ALREADY_DEFINED
+#else
+#define yylex_init fts0blex_init
+#endif
+
+#ifdef yylex_init_extra
+#define fts0blex_init_extra_ALREADY_DEFINED
+#else
+#define yylex_init_extra fts0blex_init_extra
+#endif
+
+#ifdef yylex_destroy
+#define fts0blex_destroy_ALREADY_DEFINED
+#else
+#define yylex_destroy fts0blex_destroy
+#endif
+
+#ifdef yyget_debug
+#define fts0bget_debug_ALREADY_DEFINED
+#else
+#define yyget_debug fts0bget_debug
+#endif
+
+#ifdef yyset_debug
+#define fts0bset_debug_ALREADY_DEFINED
+#else
+#define yyset_debug fts0bset_debug
+#endif
+
+#ifdef yyget_extra
+#define fts0bget_extra_ALREADY_DEFINED
+#else
+#define yyget_extra fts0bget_extra
+#endif
+
+#ifdef yyset_extra
+#define fts0bset_extra_ALREADY_DEFINED
+#else
+#define yyset_extra fts0bset_extra
+#endif
+
+#ifdef yyget_in
+#define fts0bget_in_ALREADY_DEFINED
+#else
+#define yyget_in fts0bget_in
+#endif
+
+#ifdef yyset_in
+#define fts0bset_in_ALREADY_DEFINED
+#else
+#define yyset_in fts0bset_in
+#endif
+
+#ifdef yyget_out
+#define fts0bget_out_ALREADY_DEFINED
+#else
+#define yyget_out fts0bget_out
+#endif
+
+#ifdef yyset_out
+#define fts0bset_out_ALREADY_DEFINED
+#else
+#define yyset_out fts0bset_out
+#endif
+
+#ifdef yyget_leng
+#define fts0bget_leng_ALREADY_DEFINED
+#else
+#define yyget_leng fts0bget_leng
+#endif
+
+#ifdef yyget_text
+#define fts0bget_text_ALREADY_DEFINED
+#else
+#define yyget_text fts0bget_text
+#endif
+
+#ifdef yyget_lineno
+#define fts0bget_lineno_ALREADY_DEFINED
+#else
+#define yyget_lineno fts0bget_lineno
+#endif
+
+#ifdef yyset_lineno
+#define fts0bset_lineno_ALREADY_DEFINED
+#else
+#define yyset_lineno fts0bset_lineno
+#endif
+
+#ifdef yyget_column
+#define fts0bget_column_ALREADY_DEFINED
+#else
+#define yyget_column fts0bget_column
+#endif
+
+#ifdef yyset_column
+#define fts0bset_column_ALREADY_DEFINED
+#else
+#define yyset_column fts0bset_column
+#endif
+
+#ifdef yywrap
+#define fts0bwrap_ALREADY_DEFINED
+#else
+#define yywrap fts0bwrap
+#endif
+
+#ifdef yyalloc
+#define fts0balloc_ALREADY_DEFINED
+#else
+#define yyalloc fts0balloc
+#endif
+
+#ifdef yyrealloc
+#define fts0brealloc_ALREADY_DEFINED
+#else
+#define yyrealloc fts0brealloc
+#endif
+
+#ifdef yyfree
+#define fts0bfree_ALREADY_DEFINED
+#else
+#define yyfree fts0bfree
+#endif
+
+/* First, we deal with  platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types. 
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t; 
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN               (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN              (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN              (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX               (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX              (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX              (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX              (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX             (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX             (4294967295U)
+#endif
+
+#ifndef SIZE_MAX
+#define SIZE_MAX               (~(size_t)0)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+/* begin standard C++ headers. */
+
+/* TODO: this is always defined, so inline it */
+#define yyconst const
+
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define yynoreturn __attribute__((__noreturn__))
+#else
+#define yynoreturn
+#endif
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+   are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+	{
+	FILE *yy_input_file;
+
+	char *yy_ch_buf;		/* input buffer */
+	char *yy_buf_pos;		/* current position in input buffer */
+
+	/* Size of input buffer in bytes, not including room for EOB
+	 * characters.
+	 */
+	int yy_buf_size;
+
+	/* Number of characters read into yy_ch_buf, not including EOB
+	 * characters.
+	 */
+	int yy_n_chars;
+
+	/* Whether we "own" the buffer - i.e., we know we created it,
+	 * and can realloc() it to grow it, and should free() it to
+	 * delete it.
+	 */
+	int yy_is_our_buffer;
+
+	/* Whether this is an "interactive" input source; if so, and
+	 * if we're using stdio for input, then we want to use getc()
+	 * instead of fread(), to make sure we stop fetching input after
+	 * each newline.
+	 */
+	int yy_is_interactive;
+
+	/* Whether we're considered to be at the beginning of a line.
+	 * If so, '^' rules will be active on the next match, otherwise
+	 * not.
+	 */
+	int yy_at_bol;
+
+    int yy_bs_lineno; /**< The line count. */
+    int yy_bs_column; /**< The column count. */
+
+	/* Whether to try to fill the input buffer when we reach the
+	 * end of it.
+	 */
+	int yy_fill_buffer;
+
+	int yy_buffer_status;
+
+	};
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+void yyrestart ( FILE *input_file , yyscan_t yyscanner );
+void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size , yyscan_t yyscanner );
+void yy_delete_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner );
+void yy_flush_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner );
+void yypush_buffer_state ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner );
+void yypop_buffer_state ( yyscan_t yyscanner );
+
+YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_scan_string ( const char *yy_str , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len , yyscan_t yyscanner );
+
+void *yyalloc ( yy_size_t , yyscan_t yyscanner );
+void *yyrealloc ( void *, yy_size_t , yyscan_t yyscanner );
+void yyfree ( void * , yyscan_t yyscanner );
+
+/* Begin user sect3 */
+
+#define fts0bwrap(yyscanner) (/*CONSTCOND*/1)
+#define YY_SKIP_YYWRAP
+
+#define yytext_ptr yytext_r
+
+#ifdef YY_HEADER_EXPORT_START_CONDITIONS
+#define INITIAL 0
+
+#endif
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+int yylex_init (yyscan_t* scanner);
+
+int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner);
+
+/* Accessor methods to globals.
+   These are made visible to non-reentrant scanners for convenience. */
+
+int yylex_destroy ( yyscan_t yyscanner );
+
+int yyget_debug ( yyscan_t yyscanner );
+
+void yyset_debug ( int debug_flag , yyscan_t yyscanner );
+
+YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner );
+
+void yyset_extra ( YY_EXTRA_TYPE user_defined , yyscan_t yyscanner );
+
+FILE *yyget_in ( yyscan_t yyscanner );
+
+void yyset_in  ( FILE * _in_str , yyscan_t yyscanner );
+
+FILE *yyget_out ( yyscan_t yyscanner );
+
+void yyset_out  ( FILE * _out_str , yyscan_t yyscanner );
+
+			int yyget_leng ( yyscan_t yyscanner );
+
+char *yyget_text ( yyscan_t yyscanner );
+
+int yyget_lineno ( yyscan_t yyscanner );
+
+void yyset_lineno ( int _line_number , yyscan_t yyscanner );
+
+int yyget_column  ( yyscan_t yyscanner );
+
+void yyset_column ( int _column_no , yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int yywrap ( yyscan_t yyscanner );
+#else
+extern int yywrap ( yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy ( char *, const char *, int , yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen ( const char * , yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int yylex (yyscan_t yyscanner);
+
+#define YY_DECL int yylex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+
+#ifndef fts0b_create_buffer_ALREADY_DEFINED
+#undef yy_create_buffer
+#endif
+#ifndef fts0b_delete_buffer_ALREADY_DEFINED
+#undef yy_delete_buffer
+#endif
+#ifndef fts0b_scan_buffer_ALREADY_DEFINED
+#undef yy_scan_buffer
+#endif
+#ifndef fts0b_scan_string_ALREADY_DEFINED
+#undef yy_scan_string
+#endif
+#ifndef fts0b_scan_bytes_ALREADY_DEFINED
+#undef yy_scan_bytes
+#endif
+#ifndef fts0b_init_buffer_ALREADY_DEFINED
+#undef yy_init_buffer
+#endif
+#ifndef fts0b_flush_buffer_ALREADY_DEFINED
+#undef yy_flush_buffer
+#endif
+#ifndef fts0b_load_buffer_state_ALREADY_DEFINED
+#undef yy_load_buffer_state
+#endif
+#ifndef fts0b_switch_to_buffer_ALREADY_DEFINED
+#undef yy_switch_to_buffer
+#endif
+#ifndef fts0bpush_buffer_state_ALREADY_DEFINED
+#undef yypush_buffer_state
+#endif
+#ifndef fts0bpop_buffer_state_ALREADY_DEFINED
+#undef yypop_buffer_state
+#endif
+#ifndef fts0bensure_buffer_stack_ALREADY_DEFINED
+#undef yyensure_buffer_stack
+#endif
+#ifndef fts0blex_ALREADY_DEFINED
+#undef yylex
+#endif
+#ifndef fts0brestart_ALREADY_DEFINED
+#undef yyrestart
+#endif
+#ifndef fts0blex_init_ALREADY_DEFINED
+#undef yylex_init
+#endif
+#ifndef fts0blex_init_extra_ALREADY_DEFINED
+#undef yylex_init_extra
+#endif
+#ifndef fts0blex_destroy_ALREADY_DEFINED
+#undef yylex_destroy
+#endif
+#ifndef fts0bget_debug_ALREADY_DEFINED
+#undef yyget_debug
+#endif
+#ifndef fts0bset_debug_ALREADY_DEFINED
+#undef yyset_debug
+#endif
+#ifndef fts0bget_extra_ALREADY_DEFINED
+#undef yyget_extra
+#endif
+#ifndef fts0bset_extra_ALREADY_DEFINED
+#undef yyset_extra
+#endif
+#ifndef fts0bget_in_ALREADY_DEFINED
+#undef yyget_in
+#endif
+#ifndef fts0bset_in_ALREADY_DEFINED
+#undef yyset_in
+#endif
+#ifndef fts0bget_out_ALREADY_DEFINED
+#undef yyget_out
+#endif
+#ifndef fts0bset_out_ALREADY_DEFINED
+#undef yyset_out
+#endif
+#ifndef fts0bget_leng_ALREADY_DEFINED
+#undef yyget_leng
+#endif
+#ifndef fts0bget_text_ALREADY_DEFINED
+#undef yyget_text
+#endif
+#ifndef fts0bget_lineno_ALREADY_DEFINED
+#undef yyget_lineno
+#endif
+#ifndef fts0bset_lineno_ALREADY_DEFINED
+#undef yyset_lineno
+#endif
+#ifndef fts0bget_column_ALREADY_DEFINED
+#undef yyget_column
+#endif
+#ifndef fts0bset_column_ALREADY_DEFINED
+#undef yyset_column
+#endif
+#ifndef fts0bwrap_ALREADY_DEFINED
+#undef yywrap
+#endif
+#ifndef fts0bget_lval_ALREADY_DEFINED
+#undef yyget_lval
+#endif
+#ifndef fts0bset_lval_ALREADY_DEFINED
+#undef yyset_lval
+#endif
+#ifndef fts0bget_lloc_ALREADY_DEFINED
+#undef yyget_lloc
+#endif
+#ifndef fts0bset_lloc_ALREADY_DEFINED
+#undef yyset_lloc
+#endif
+#ifndef fts0balloc_ALREADY_DEFINED
+#undef yyalloc
+#endif
+#ifndef fts0brealloc_ALREADY_DEFINED
+#undef yyrealloc
+#endif
+#ifndef fts0bfree_ALREADY_DEFINED
+#undef yyfree
+#endif
+#ifndef fts0btext_ALREADY_DEFINED
+#undef yytext
+#endif
+#ifndef fts0bleng_ALREADY_DEFINED
+#undef yyleng
+#endif
+#ifndef fts0bin_ALREADY_DEFINED
+#undef yyin
+#endif
+#ifndef fts0bout_ALREADY_DEFINED
+#undef yyout
+#endif
+#ifndef fts0b_flex_debug_ALREADY_DEFINED
+#undef yy_flex_debug
+#endif
+#ifndef fts0blineno_ALREADY_DEFINED
+#undef yylineno
+#endif
+#ifndef fts0btables_fload_ALREADY_DEFINED
+#undef yytables_fload
+#endif
+#ifndef fts0btables_destroy_ALREADY_DEFINED
+#undef yytables_destroy
+#endif
+#ifndef fts0bTABLES_NAME_ALREADY_DEFINED
+#undef yyTABLES_NAME
+#endif
+
+#line 74 "fts0blex.l"
+
+
+#line 701 "../include/fts0blex.h"
+#undef fts0bIN_HEADER
+#endif /* fts0bHEADER_H */
diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h
new file mode 100644
index 00000000..c0151b44
--- /dev/null
+++ b/storage/innobase/include/fts0fts.h
@@ -0,0 +1,947 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0fts.h
+Full text search header file
+
+Created 2011/09/02 Sunny Bains
+***********************************************************************/
+
+#pragma once
+
+#include "data0type.h"
+#include "data0types.h"
+#include "mem0mem.h"
+#include "rem0types.h"
+#include "row0types.h"
+#include "trx0types.h"
+#include "ut0vec.h"
+#include "ut0rbt.h"
+#include "ut0wqueue.h"
+#include "que0types.h"
+#include "ft_global.h"
+#include "mysql/plugin_ftparser.h"
+
+/** "NULL" value of a document id. */
+#define FTS_NULL_DOC_ID			0
+
+/** FTS hidden column that is used to map to and from the row */
+#define FTS_DOC_ID_COL_NAME		"FTS_DOC_ID"
+
+/** The name of the index created by FTS */
+#define FTS_DOC_ID_INDEX_NAME		"FTS_DOC_ID_INDEX"
+
+#define FTS_DOC_ID_INDEX_NAME_LEN	16
+
+/** Doc ID is a 8 byte value */
+#define FTS_DOC_ID_LEN			8
+
+/** The number of fields to sort when we build FT index with
+FIC. Three fields are sorted: (word, doc_id, position) */
+#define FTS_NUM_FIELDS_SORT		3
+
+/** Maximum number of rows in a table, smaller than which, we will
+optimize using a 4 byte Doc ID for FIC merge sort to reduce sort size */
+#define MAX_DOC_ID_OPT_VAL		1073741824
+
+/** Document id type. */
+typedef ib_id_t doc_id_t;
+
+/** doc_id_t printf format */
+#define FTS_DOC_ID_FORMAT	IB_ID_FMT
+
+/** Convert document id to the InnoDB (BIG ENDIAN) storage format. */
+#define fts_write_doc_id(d, s)	mach_write_to_8(d, s)
+
+/** Read a document id to internal format. */
+#define fts_read_doc_id(s)	mach_read_from_8(s)
+
+/** Bind the doc id to a variable */
+#define fts_bind_doc_id(i, n, v) pars_info_bind_int8_literal(i, n, v)
+
+/** Defines for FTS query mode, they have the same values as
+those defined in mysql file ft_global.h */
+#define FTS_NL		0
+#define FTS_BOOL	1
+#define FTS_SORTED	2
+#define FTS_EXPAND	4
+#define FTS_NO_RANKING	8
+#define FTS_PROXIMITY	16
+#define FTS_PHRASE	32
+#define FTS_OPT_RANKING	64
+
+#define FTS_INDEX_TABLE_IND_NAME	"FTS_INDEX_TABLE_IND"
+
+/** The number of FTS index partitions for a fulltext index */
+#define FTS_NUM_AUX_INDEX		6
+
+/** Threshold where our optimize thread automatically kicks in */
+#define FTS_OPTIMIZE_THRESHOLD		10000000
+
+/** Maximum possible Fulltext word length in bytes (assuming mbmaxlen=4) */
+#define FTS_MAX_WORD_LEN		(HA_FT_MAXCHARLEN * 4)
+
+/** Maximum possible Fulltext word length (in characters) */
+#define FTS_MAX_WORD_LEN_IN_CHAR	HA_FT_MAXCHARLEN
+
+/** Number of columns in FTS AUX Tables */
+#define FTS_DELETED_TABLE_NUM_COLS	1
+#define FTS_CONFIG_TABLE_NUM_COLS	2
+#define FTS_AUX_INDEX_TABLE_NUM_COLS	5
+
+/** DELETED_TABLE(doc_id BIGINT UNSIGNED) */
+#define FTS_DELETED_TABLE_COL_LEN	8
+/** CONFIG_TABLE(key CHAR(50), value CHAR(200)) */
+#define FTS_CONFIG_TABLE_KEY_COL_LEN	50
+#define FTS_CONFIG_TABLE_VALUE_COL_LEN	200
+
+#define FTS_INDEX_FIRST_DOC_ID_LEN	8
+#define FTS_INDEX_LAST_DOC_ID_LEN	8
+#define FTS_INDEX_DOC_COUNT_LEN		4
+/* BLOB COLUMN, 0 means VARIABLE SIZE */
+#define FTS_INDEX_ILIST_LEN		0
+
+
+/** Variable specifying the FTS parallel sort degree */
+extern ulong		fts_sort_pll_degree;
+
+/** Variable specifying the number of words to optimize for each optimize table
+call */
+extern ulong		fts_num_word_optimize;
+
+/** Variable specifying whether we do additional FTS diagnostic printout
+in the log */
+extern char		fts_enable_diag_print;
+
+/** FTS rank type, which will be between 0 .. 1 inclusive */
+typedef float 		fts_rank_t;
+
+/** Type of a row during a transaction. FTS_NOTHING means the row can be
+forgotten from the FTS system's POV, FTS_INVALID is an internal value used
+to mark invalid states.
+
+NOTE: Do not change the order or value of these, fts_trx_row_get_new_state
+depends on them being exactly as they are. */
+enum fts_row_state {
+	FTS_INSERT = 0,
+	FTS_MODIFY,
+	FTS_DELETE,
+	FTS_NOTHING,
+	FTS_INVALID
+};
+
+/** The FTS table types. */
+enum fts_table_type_t {
+	FTS_INDEX_TABLE,		/*!< FTS auxiliary table that is
+					specific to a particular FTS index
+					on a table */
+
+	FTS_COMMON_TABLE		/*!< FTS auxiliary table that is common
+					for all FTS index on a table */
+};
+
+struct fts_doc_t;
+struct fts_cache_t;
+struct fts_token_t;
+struct fts_doc_ids_t;
+struct fts_index_cache_t;
+
+
+/** Initialize the "fts_table" for internal query into FTS auxiliary
+tables. Expands to one statement; the caller supplies the ';'. */
+#define FTS_INIT_FTS_TABLE(fts_table, m_suffix, m_type, m_table)\
+do {								\
+	(fts_table)->suffix = (m_suffix);			\
+	(fts_table)->type = (m_type);				\
+	(fts_table)->table_id = (m_table)->id;			\
+	(fts_table)->table = (m_table);				\
+} while (0)
+/** Initialize "fts_table" from an FTS index; the caller supplies the ';'. */
+#define FTS_INIT_INDEX_TABLE(fts_table, m_suffix, m_type, m_index)\
+do {								\
+	(fts_table)->suffix = (m_suffix);			\
+	(fts_table)->type = (m_type);				\
+	(fts_table)->table_id = (m_index)->table->id;		\
+	(fts_table)->table = (m_index)->table;			\
+	(fts_table)->index_id = (m_index)->id;			\
+} while (0)
+
+/** Information about changes in a single transaction affecting
+the FTS system. */
+struct fts_trx_t {
+	trx_t*		trx;		/*!< InnoDB transaction */
+
+	ib_vector_t*	savepoints;	/*!< Active savepoints, must have at
+					least one element, the implied
+					savepoint */
+	ib_vector_t*	last_stmt;	/*!< last_stmt */
+
+	mem_heap_t*	heap;		/*!< heap */
+};
+
+/** Information required for transaction savepoint handling. */
+struct fts_savepoint_t {
+	char*		name;		/*!< First entry is always NULL, the
+					default instance. Otherwise the name
+					of the savepoint */
+
+	ib_rbt_t*	tables;		/*!< Modified FTS tables */
+};
+
+/** Information about changed rows in a transaction for a single table. */
+struct fts_trx_table_t {
+	dict_table_t*	table;		/*!< table the changed rows belong to */
+
+	fts_trx_t*	fts_trx;	/*!< link to parent FTS transaction */
+
+	ib_rbt_t*	rows;		/*!< rows changed; indexed by doc-id,
+					cells are fts_trx_row_t* */
+
+	fts_doc_ids_t*	added_doc_ids;	/*!< list of added doc ids (NULL until
+					the first addition) */
+
+					/** graph for adding doc ids */
+	que_t*		docs_added_graph;
+};
+
+/** Information about one changed row in a transaction. */
+struct fts_trx_row_t {
+	doc_id_t	doc_id;		/*!< Id of the ins/upd/del document */
+
+	fts_row_state	state;		/*!< state of the row */
+
+	ib_vector_t*	fts_indexes;	/*!< The indexes that are affected */
+};
+
+/** List of document ids that were added during a transaction. This
+list is passed on to a background 'Add' thread and OPTIMIZE, so it
+needs its own memory heap. */
+struct fts_doc_ids_t {
+	ib_vector_t*	doc_ids;	/*!< document ids (each element is
+					of type doc_id_t). */
+
+	ib_alloc_t*	self_heap;	/*!< Allocator used to create an
+					instance of this type and the
+					doc_ids vector */
+};
+
+// FIXME: Get rid of this if possible.
+/** Since MySQL's character set support for Unicode is woefully inadequate
+(it supports basic operations like isalpha etc. only for 8-bit characters),
+we have to implement our own. We use UTF-16 without surrogate processing
+as our in-memory format. This typedef is a single such character. */
+typedef unsigned short ib_uc_t;
+
+/** A UTF-16 or UTF-8 string. */
+struct fts_string_t {
+	byte*		f_str;		/*!< string, not necessarily terminated
+					in any way */
+	ulint		f_len;		/*!< Length of the string in bytes */
+	ulint		f_n_char;	/*!< Number of characters */
+};
+
+/** Query ranked doc ids. */
+struct fts_ranking_t {
+	doc_id_t	doc_id;		/*!< Document id */
+
+	fts_rank_t	rank;		/*!< Rank is between 0 .. 1 */
+
+	byte*		words;		/*!< this contains the words
+					that were queried
+					and found in this document */
+	ulint		words_len;	/*!< words len */
+};
+
+/** Query result. */
+struct fts_result_t {
+	ib_rbt_node_t*	current;	/*!< Current element */
+
+	ib_rbt_t*	rankings_by_id;	/*!< RB tree of type fts_ranking_t
+					indexed by doc id */
+	ib_rbt_t*	rankings_by_rank;/*!< RB tree of type fts_ranking_t
+					indexed by rank */
+};
+
+/** This is used to generate the FTS auxiliary table name, we need the
+table id and the index id to generate the column specific FTS auxiliary
+table name. */
+struct fts_table_t {
+	fts_table_type_t
+			type;		/*!< The auxiliary table type */
+
+	table_id_t	table_id;	/*!< The table id */
+
+	index_id_t	index_id;	/*!< The index id */
+
+	const char*	suffix;		/*!< The suffix of the fts auxiliary
+					table name, can be NULL, not used
+					everywhere (yet) */
+	const dict_table_t*
+			table;		/*!< Parent table */
+	CHARSET_INFO*	charset;	/*!< charset info if it is for FTS
+					index auxiliary table */
+};
+
+/** The state of the FTS sub system. */
+class fts_t {
+public:
+	/** fts_t constructor.
+	@param[in]	table	table with FTS indexes
+	@param[in,out]	heap	memory heap where 'this' is stored */
+	fts_t(
+		const dict_table_t*	table,
+		mem_heap_t*		heap);
+
+	/** fts_t destructor. */
+	~fts_t();
+
+	/** Whether the ADDED table record was synced after crash recovery */
+	unsigned	added_synced:1;
+	/** Whether the table holds dict_sys.latch */
+	unsigned	dict_locked:1;
+
+	/** Work queue for scheduling jobs for the FTS 'Add' thread, or NULL
+	if the thread has not yet been created. Each work item is a
+	fts_trx_doc_ids_t*. */
+	ib_wqueue_t*	add_wq;
+
+	/** FTS memory buffer for this table, or NULL if the table has no FTS
+	index. */
+	fts_cache_t*	cache;
+
+	/** FTS doc id hidden column number in the CLUSTERED index. */
+	ulint		doc_col;
+
+	/** Vector of FTS indexes, this is mainly for caching purposes. */
+	ib_vector_t*	indexes;
+
+	/** Whether the table exists in fts_optimize_wq;
+	protected by fts_optimize_wq mutex */
+	bool		in_queue;
+
+	/** Whether the sync message exists in fts_optimize_wq;
+	protected by fts_optimize_wq mutex */
+	bool		sync_message;
+
+	/** Heap for fts_t allocation. */
+	mem_heap_t*	fts_heap;
+};
+
+struct fts_stopword_t;
+
+/** status bits for fts_stopword_t status field. */
+#define STOPWORD_NOT_INIT               0x1
+#define STOPWORD_OFF                    0x2
+#define STOPWORD_FROM_DEFAULT           0x4
+#define STOPWORD_USER_TABLE             0x8
+
+extern const char*	fts_default_stopword[];
+
+/** Variable specifying the maximum FTS cache size for each table */
+extern Atomic_relaxed<size_t> fts_max_cache_size;
+
+/** Variable specifying the total memory allocated for FTS cache */
+extern Atomic_relaxed<size_t> fts_max_total_cache_size;
+
+/** Variable specifying the FTS result cache limit for each query */
+extern size_t		fts_result_cache_limit;
+
+/** Variable specifying the maximum FTS token size */
+extern ulong		fts_max_token_size;
+
+/** Variable specifying the minimum FTS token size */
+extern ulong		fts_min_token_size;
+
+/** Whether the total memory used for FTS cache is exhausted, and we will
+need a sync to free some memory */
+extern bool		fts_need_sync;
+
+/******************************************************************//**
+Create a FTS cache. */
+fts_cache_t*
+fts_cache_create(
+/*=============*/
+	dict_table_t*	table);			/*!< table owns the FTS cache */
+
+/******************************************************************//**
+Create a FTS index cache.
+@return Index Cache */
+fts_index_cache_t*
+fts_cache_index_cache_create(
+/*=========================*/
+	dict_table_t*	table,			/*!< in: table with FTS index */
+	dict_index_t*	index);			/*!< in: FTS index */
+
+/******************************************************************//**
+Get the next available document id. This function creates a new
+transaction to generate the document id.
+@return DB_SUCCESS if OK */
+dberr_t
+fts_get_next_doc_id(
+/*================*/
+	const dict_table_t*	table,	/*!< in: table */
+	doc_id_t*		doc_id);/*!< out: new document id */
+
+/******************************************************************//**
+Create a new fts_doc_ids_t.
+@return new fts_doc_ids_t. */
+fts_doc_ids_t*
+fts_doc_ids_create(void);
+/*=====================*/
+
+/** Free a fts_doc_ids_t by freeing the heap held by its allocator. */
+inline void fts_doc_ids_free(fts_doc_ids_t* doc_ids) {
+	auto heap_arg = doc_ids->self_heap->arg;
+	mem_heap_free(static_cast<mem_heap_t*>(heap_arg));
+}
+
+/******************************************************************//**
+Notify the FTS system about an operation on an FTS-indexed table. */
+void
+fts_trx_add_op(
+/*===========*/
+	trx_t*		trx,			/*!< in: InnoDB transaction */
+	dict_table_t*	table,			/*!< in: table */
+	doc_id_t	doc_id,			/*!< in: doc id */
+	fts_row_state	state,			/*!< in: state of the row */
+	ib_vector_t*	fts_indexes);		/*!< in: FTS indexes affected
+						(NULL=all) */
+
+/******************************************************************//**
+Free an FTS trx. */
+void
+fts_trx_free(
+/*=========*/
+	fts_trx_t*	fts_trx);		/*!< in, own: FTS trx */
+
+/** Creates the common auxiliary tables needed for supporting an FTS index
+on the given table.
+The following tables are created.
+CREATE TABLE $FTS_PREFIX_DELETED
+	(doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_DELETED_CACHE
+	(doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_BEING_DELETED
+	(doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_BEING_DELETED_CACHE
+	(doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_CONFIG
+	(key CHAR(50), value CHAR(200), UNIQUE CLUSTERED INDEX on key)
+@param[in,out]	trx			transaction
+@param[in]	table			table with FTS index
+@param[in]	skip_doc_id_index	Skip index on doc id
+@return DB_SUCCESS if succeed */
+dberr_t
+fts_create_common_tables(
+	trx_t*		trx,
+	dict_table_t*	table,
+	bool		skip_doc_id_index)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Creates the column specific ancillary tables needed for supporting an
+FTS index on the given table.
+
+All FTS AUX Index tables have the following schema.
+CREATE TABLE $FTS_PREFIX_INDEX_[1-6](
+	word		VARCHAR(FTS_MAX_WORD_LEN),
+	first_doc_id	INT NOT NULL,
+	last_doc_id	UNSIGNED NOT NULL,
+	doc_count	UNSIGNED INT NOT NULL,
+	ilist		VARBINARY NOT NULL,
+	UNIQUE CLUSTERED INDEX ON (word, first_doc_id))
+@param[in,out]	trx	dictionary transaction
+@param[in]	index	fulltext index
+@param[in]	id	table id
+@return DB_SUCCESS or error code */
+dberr_t
+fts_create_index_tables(trx_t* trx, const dict_index_t* index, table_id_t id)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Add the FTS document id hidden column. */
+void
+fts_add_doc_id_column(
+/*==================*/
+	dict_table_t*	table,	/*!< in/out: Table with FTS index */
+	mem_heap_t*	heap);	/*!< in: temporary memory heap, or NULL */
+
+/** Lock the internal FTS_ tables for an index, before fts_drop_index_tables().
+@param trx   transaction
+@param index fulltext index */
+dberr_t fts_lock_index_tables(trx_t *trx, const dict_index_t &index);
+
+/** Lock the internal common FTS_ tables, before fts_drop_common_tables().
+@param trx    transaction
+@param table  table containing FULLTEXT INDEX
+@return DB_SUCCESS or error code */
+dberr_t fts_lock_common_tables(trx_t *trx, const dict_table_t &table);
+
+/** Lock the internal FTS_ tables for table, before fts_drop_tables().
+@param trx    transaction
+@param table  table containing FULLTEXT INDEX
+@return DB_SUCCESS or error code */
+dberr_t fts_lock_tables(trx_t *trx, const dict_table_t &table);
+
+/** Drop the internal FTS_ tables for table.
+@param trx    transaction
+@param table  table containing FULLTEXT INDEX
+@return DB_SUCCESS or error code */
+dberr_t fts_drop_tables(trx_t *trx, const dict_table_t &table);
+
+/******************************************************************//**
+The given transaction is about to be committed; do whatever is necessary
+from the FTS system's POV.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_commit(
+/*=======*/
+	trx_t*		trx)			/*!< in: transaction */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** FTS Query entry point.
+@param[in,out]	trx		transaction
+@param[in]	index		fts index to search
+@param[in]	flags		FTS search mode
+@param[in]	query_str	FTS query
+@param[in]	query_len	FTS query string len in bytes
+@param[in,out]	result		result doc ids
+@return DB_SUCCESS if successful otherwise error code */
+dberr_t
+fts_query(
+	trx_t*		trx,
+	dict_index_t*	index,
+	uint		flags,
+	const byte*	query_str,
+	ulint		query_len,
+	fts_result_t**	result)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************************//**
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+@return the relevance ranking value. */
+float
+fts_retrieve_ranking(
+/*=================*/
+	fts_result_t*	result,			/*!< in: FTS result structure */
+	doc_id_t	doc_id);		/*!< in: the interested document
+						doc_id */
+
+/******************************************************************//**
+FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */
+void
+fts_query_sort_result_on_rank(
+/*==========================*/
+	fts_result_t*	result);		/*!< out: result instance
+						to sort.*/
+
+/******************************************************************//**
+FTS Query free result, returned by fts_query(). */
+void
+fts_query_free_result(
+/*==================*/
+	fts_result_t*	result);		/*!< in: result instance
+						to free.*/
+
+/******************************************************************//**
+Extract the doc id from the FTS hidden column. */
+doc_id_t
+fts_get_doc_id_from_row(
+/*====================*/
+	dict_table_t*	table,			/*!< in: table */
+	dtuple_t*	row);			/*!< in: row whose FTS doc id we
+						want to extract.*/
+
+/** Extract the doc id from the record that belongs to index.
+@param[in]	rec	record containing FTS_DOC_ID
+@param[in]	index	index of rec
+@param[in]	offsets	rec_get_offsets(rec,index)
+@return doc id that was extracted from rec */
+doc_id_t
+fts_get_doc_id_from_rec(
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	const rec_offs*		offsets);
+
+/** Add new fts doc id to the update vector.
+@param[in]	table		the table that contains the FTS index.
+@param[in,out]	ufield		the fts doc id field in the update vector.
+				No new memory is allocated for this in this
+				function.
+@param[in,out]	next_doc_id	the fts doc id that has been added to the
+				update vector.  If 0, a new fts doc id is
+				automatically generated.  The memory provided
+				for this argument will be used by the update
+				vector. Ensure that the life time of this
+				memory matches that of the update vector.
+@return the fts doc id used in the update vector */
+doc_id_t
+fts_update_doc_id(
+	dict_table_t*	table,
+	upd_field_t*	ufield,
+	doc_id_t*	next_doc_id);
+
+/******************************************************************//**
+FTS initialize. */
+void
+fts_startup(void);
+/*==============*/
+
+/******************************************************************//**
+Create an instance of fts_t.
+@return instance of fts_t */
+fts_t*
+fts_create(
+/*=======*/
+	dict_table_t*	table);			/*!< out: table with FTS
+						indexes */
+
+/*********************************************************************//**
+Run OPTIMIZE on the given table.
+@return DB_SUCCESS if all OK */
+dberr_t
+fts_optimize_table(
+/*===============*/
+	dict_table_t*	table);			/*!< in: table to optimize */
+
+/**********************************************************************//**
+Startup the optimize thread and create the work queue. */
+void
+fts_optimize_init(void);
+/*====================*/
+
+/****************************************************************//**
+Drops index ancillary tables for a FTS index
+@return DB_SUCCESS or error code */
+dberr_t fts_drop_index_tables(trx_t *trx, const dict_index_t &index)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Add the table to the OPTIMIZER's list.
+@param[in]	table	table to add */
+void
+fts_optimize_add_table(
+	dict_table_t*	table);
+
+/******************************************************************//**
+Remove the table from the OPTIMIZER's list. We do wait for
+acknowledgement from the consumer of the message. */
+void
+fts_optimize_remove_table(
+/*======================*/
+	dict_table_t*	table);			/*!< in: table to remove */
+
+/** Shutdown fts optimize thread. */
+void
+fts_optimize_shutdown();
+
+/** Send a request to sync the fts cache for the table.
+@param[in]	table	table to sync */
+void
+fts_optimize_request_sync_table(
+	dict_table_t*	table);
+
+/**********************************************************************//**
+Take a FTS savepoint. */
+void
+fts_savepoint_take(
+/*===============*/
+	fts_trx_t*	fts_trx,		/*!< in: fts transaction */
+	const char*	name);			/*!< in: savepoint name */
+
+/**********************************************************************//**
+Refresh last statement savepoint. */
+void
+fts_savepoint_laststmt_refresh(
+/*===========================*/
+	trx_t*		trx);			/*!< in: transaction */
+
+/**********************************************************************//**
+Release the savepoint data identified by name. */
+void
+fts_savepoint_release(
+/*==================*/
+	trx_t*		trx,			/*!< in: transaction */
+	const char*	name);			/*!< in: savepoint name */
+
+/** Clear cache.
+@param[in,out]	cache	fts cache */
+void
+fts_cache_clear(
+	fts_cache_t*	cache);
+
+/*********************************************************************//**
+Initialize things in cache. */
+void
+fts_cache_init(
+/*===========*/
+	fts_cache_t*	cache);			/*!< in: cache */
+
+/*********************************************************************//**
+Rollback to and including savepoint identified by name. */
+void
+fts_savepoint_rollback(
+/*===================*/
+	trx_t*		trx,			/*!< in: transaction */
+	const char*	name);			/*!< in: savepoint name */
+
+/*********************************************************************//**
+Rollback the last statement's savepoint (this variant takes no name). */
+void
+fts_savepoint_rollback_last_stmt(
+/*=============================*/
+	trx_t*		trx);			/*!< in: transaction */
+
+/** Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@param[in,out]	table		fts table
+@param[in]	wait		whether to wait for existing sync to finish
+@return DB_SUCCESS on success, error code on failure. */
+dberr_t fts_sync_table(dict_table_t* table, bool wait = true);
+
+/****************************************************************//**
+Get the character set of an FTS index. */
+CHARSET_INFO*
+fts_index_get_charset(
+/*==================*/
+	dict_index_t*		index);		/*!< in: FTS index */
+
+/*********************************************************************//**
+Get the initial Doc ID by consulting the CONFIG table
+@return initial Doc ID */
+doc_id_t
+fts_init_doc_id(
+/*============*/
+	const dict_table_t*		table);	/*!< in: table */
+
+/******************************************************************//**
+Compare two character strings according to their charset. */
+extern
+int
+innobase_fts_text_cmp(
+/*==================*/
+	const void*	cs,			/*!< in: Character set */
+	const void*	p1,			/*!< in: key */
+	const void*	p2);			/*!< in: node */
+
+/******************************************************************//**
+Makes all characters in a string lower case. */
+extern
+size_t
+innobase_fts_casedn_str(
+/*====================*/
+        CHARSET_INFO*	cs,			/*!< in: Character set */
+	char*		src,			/*!< in: string to put in
+						lower case */
+	size_t		src_len,		/*!< in: input string length */
+	char*		dst,			/*!< out: buffer for result
+						string */
+	size_t		dst_len);		/*!< in: buffer size */
+
+
+/******************************************************************//**
+Prefix-compare two character strings according to their charset. */
+extern
+int
+innobase_fts_text_cmp_prefix(
+/*=========================*/
+	const void*	cs,			/*!< in: Character set */
+	const void*	p1,			/*!< in: key */
+	const void*	p2);			/*!< in: node */
+
+/*************************************************************//**
+Get the next token from the given string and store it in *token. */
+extern
+ulint
+innobase_mysql_fts_get_token(
+/*=========================*/
+	CHARSET_INFO*	charset,		/*!< in: Character set */
+	const byte*	start,			/*!< in: start of text */
+	const byte*	end,			/*!< in: one character past
+						end of text */
+	fts_string_t*	token);			/*!< out: token's text */
+
+/*************************************************************//**
+Get token char size by charset
+@return the number of token char size */
+ulint
+fts_get_token_size(
+/*===============*/
+	const CHARSET_INFO*	cs,		/*!< in: Character set */
+	const char*		token,		/*!< in: token */
+	ulint			len);		/*!< in: token length */
+
+/*************************************************************//**
+FULLTEXT tokenizer internal in MYSQL_FTPARSER_SIMPLE_MODE
+@return 0 if tokenized successfully */
+int
+fts_tokenize_document_internal(
+/*===========================*/
+	MYSQL_FTPARSER_PARAM*	param,	/*!< in: parser parameter */
+	const char*			doc,	/*!< in: document to tokenize */
+	int			len);	/*!< in: document length */
+
+/*********************************************************************//**
+Fetch COUNT(*) from specified table.
+@return the number of rows in the table */
+ulint
+fts_get_rows_count(
+/*===============*/
+	fts_table_t*	fts_table);		/*!< in: fts table to read */
+
+/*************************************************************//**
+Get maximum Doc ID in a table if index "FTS_DOC_ID_INDEX" exists
+@return max Doc ID or 0 if index "FTS_DOC_ID_INDEX" does not exist */
+doc_id_t
+fts_get_max_doc_id(
+/*===============*/
+	dict_table_t*	table);			/*!< in: user table */
+
+/** Check whether a stopword table is in the right format.
+@param stopword_table_name   table name
+@param row_end   name of the system-versioning end column, or "value"
+@return the stopword column charset
+@retval NULL if the table does not exist or qualify */
+CHARSET_INFO *fts_valid_stopword_table(const char *stopword_table_name,
+                                       const char **row_end= NULL);
+
+/****************************************************************//**
+This function loads specified stopword into FTS cache
+@return true if success */
+bool
+fts_load_stopword(
+/*==============*/
+	const dict_table_t*
+			table,			/*!< in: Table with FTS */
+	trx_t*		trx,			/*!< in: Transaction */
+	const char*	session_stopword_table,	/*!< in: Session stopword table
+						name */
+	bool		stopword_is_on,		/*!< in: Whether stopword
+						option is turned on/off */
+	bool		reload);		/*!< in: Whether it is during
+						reload of FTS table */
+
+/****************************************************************//**
+Read the rows from the FTS index
+@return DB_SUCCESS if OK */
+dberr_t
+fts_table_fetch_doc_ids(
+/*====================*/
+	trx_t*		trx,			/*!< in: transaction */
+	fts_table_t*	fts_table,		/*!< in: aux table */
+	fts_doc_ids_t*	doc_ids);		/*!< in: For collecting
+						doc ids */
+/****************************************************************//**
+This function brings the FTS index in sync when the FTS index is first
+used. There may be documents that were not yet synced to the auxiliary
+tables before the last abnormal server shutdown; we need to bring
+such documents into the FTS cache before any further operations. */
+void
+fts_init_index(
+/*===========*/
+	dict_table_t*	table,			/*!< in: Table with FTS */
+	bool		has_cache_lock);	/*!< in: Whether we already
+						have cache lock */
+/*******************************************************************//**
+Add a newly created index to the FTS cache */
+void
+fts_add_index(
+/*==========*/
+	dict_index_t*	index,			/*!< in: FTS index to be added */
+	dict_table_t*	table);			/*!< in: table */
+
+/*******************************************************************//**
+Drop auxiliary tables related to an FTS index
+@return DB_SUCCESS or error number */
+dberr_t
+fts_drop_index(
+/*===========*/
+	dict_table_t*	table,	/*!< in: Table where indexes are dropped */
+	dict_index_t*	index,	/*!< in: Index to be dropped */
+	trx_t*		trx);	/*!< in: Transaction for the drop */
+
+/****************************************************************//**
+Rename auxiliary tables for all fts index for a table
+@return DB_SUCCESS or error code */
+dberr_t
+fts_rename_aux_tables(
+/*==================*/
+	dict_table_t*	table,		/*!< in: user Table */
+	const char*	new_name,	/*!< in: new table name */
+	trx_t*		trx);		/*!< in: transaction */
+
+/*******************************************************************//**
+Check that the indexes in fts->indexes are also present in the index cache
+and in the table->indexes list
+@return TRUE if all indexes match */
+ibool
+fts_check_cached_index(
+/*===================*/
+	dict_table_t*	table);  /*!< in: Table whose FTS indexes are checked */
+
+/** Fetch the document from tuple, tokenize the text data and
+insert the text data into fts auxiliary table and
+its cache. Moreover, the fields of this tuple do not contain any information
+about externally stored fields. This tuple contains data directly
+converted from mysql.
+@param[in]     ftt     FTS transaction table
+@param[in]     doc_id  doc id
+@param[in]     tuple   tuple from where data can be retrieved
+                       and tuple should be arranged in table
+                       schema order. */
+void
+fts_add_doc_from_tuple(
+	fts_trx_table_t*ftt,
+	doc_id_t        doc_id,
+	const dtuple_t* tuple);
+
+/** Create an FTS trx.
+@param[in,out] trx     InnoDB Transaction
+@return FTS transaction. */
+fts_trx_t*
+fts_trx_create(
+	trx_t*  trx);
+
+/** Clear all fts resources when there is no internal DOC_ID
+and there are no new fts index to add.
+@param[in,out]  table   table  where fts is to be freed */
+void fts_clear_all(dict_table_t *table);
+
+/** Check whether the given name is that of an fts auxiliary table
+and fetch the parent table id and index id
+@param[in]	name		table name
+@param[in,out]	table_id	parent table id
+@param[in,out]	index_id	index id
+@return true if it is an auxiliary table */
+bool fts_check_aux_table(const char *name,
+                         table_id_t *table_id,
+                         index_id_t *index_id);
+
+/** Update the last document id. This function could create a new
+transaction to update the last document id.
+@param	table	table to be updated
+@param	doc_id	last document id
+@param	trx	update trx or null
+@retval DB_SUCCESS if OK */
+dberr_t
+fts_update_sync_doc_id(const dict_table_t *table,
+		       doc_id_t  doc_id,
+		       trx_t *trx)
+	MY_ATTRIBUTE((nonnull(1)));
+
+/** Sync the table during commit phase
+@param[in]	table	table to be synced */
+void fts_sync_during_ddl(dict_table_t* table);
diff --git a/storage/innobase/include/fts0opt.h b/storage/innobase/include/fts0opt.h
new file mode 100644
index 00000000..c527ad8e
--- /dev/null
+++ b/storage/innobase/include/fts0opt.h
@@ -0,0 +1,39 @@
+/*****************************************************************************
+
+Copyright (c) 2001, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0opt.h
+Full Text Search optimize thread
+
+Created 2011-02-15 Jimmy Yang
+***********************************************************************/
+#ifndef INNODB_FTS0OPT_H
+#define INNODB_FTS0OPT_H
+
+/** The FTS optimize thread's work queue. */
+extern ib_wqueue_t*	fts_optimize_wq;
+
+/********************************************************************
+Callback function to fetch the rows in an FTS INDEX record. */
+ibool
+fts_optimize_index_fetch_node(
+/*==========================*/
+                                        /* out: always returns non-NULL */
+        void*           row,		/* in: sel_node_t* */
+        void*           user_arg);	/* in: pointer to ib_vector_t */
+#endif
diff --git a/storage/innobase/include/fts0pars.h b/storage/innobase/include/fts0pars.h
new file mode 100644
index 00000000..8108e811
--- /dev/null
+++ b/storage/innobase/include/fts0pars.h
@@ -0,0 +1,72 @@
+/* A Bison parser, made by GNU Bison 2.5.  */
+
+/* Bison interface for Yacc-like parsers in C
+   
+      Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc.
+   
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* As a special exception, you may create a larger work that contains
+   part or all of the Bison parser skeleton and distribute that work
+   under terms of your choice, so long as that work isn't itself a
+   parser generator using the skeleton or a modified version thereof
+   as a parser skeleton.  Alternatively, if you modify or redistribute
+   the parser skeleton itself, you may (at your option) remove this
+   special exception, which will cause the skeleton and the resulting
+   Bison output files to be licensed under the GNU General Public
+   License without this special exception.
+   
+   This special exception was added by the Free Software Foundation in
+   version 2.2 of Bison.  */
+
+
+/* Tokens.  */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+   /* Put the tokens into the symbol table, so that GDB and other debuggers
+      know about them.  */
+   enum yytokentype {
+     FTS_OPER = 258,
+     FTS_TEXT = 259,
+     FTS_TERM = 260,
+     FTS_NUMB = 261
+   };
+#endif
+
+
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef union YYSTYPE
+{
+
+/* Line 2068 of yacc.c  */
+#line 61 "fts0pars.y"
+
+	int			oper;
+	fts_ast_string_t*	token;
+	fts_ast_node_t*		node;
+
+
+
+/* Line 2068 of yacc.c  */
+#line 64 "fts0pars.hh"
+} YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+
+
diff --git a/storage/innobase/include/fts0plugin.h b/storage/innobase/include/fts0plugin.h
new file mode 100644
index 00000000..18ec2d6d
--- /dev/null
+++ b/storage/innobase/include/fts0plugin.h
@@ -0,0 +1,50 @@
+/*****************************************************************************
+
+Copyright (c) 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0plugin.h
+Full text search plugin header file
+
+Created 2013/06/04 Shaohua Wang
+***********************************************************************/
+
+#ifndef INNOBASE_FTS0PLUGIN_H
+#define INNOBASE_FTS0PLUGIN_H
+
+#include "univ.i"
+
+extern struct st_mysql_ftparser fts_default_parser;
+
+struct fts_ast_state_t;
+/** Call the parser's optional init/deinit hook. Each macro expands to a bare if statement. */
+#define PARSER_INIT(parser, arg) if (parser->init) { parser->init(arg); }
+#define PARSER_DEINIT(parser, arg) if (parser->deinit) { parser->deinit(arg); }
+
+/******************************************************************//**
+fts parse query by plugin parser.
+@return 0 if parse successfully, or return non-zero. */
+int
+fts_parse_by_parser(
+/*================*/
+	ibool			mode,	/*!< in: query boolean mode */
+	uchar*			query,	/*!< in: query string */
+	ulint			len,	/*!< in: query string length */
+	st_mysql_ftparser*	parse,	/*!< in: fts plugin parser */
+	fts_ast_state_t*	state);	/*!< in: query parser state */
+
+#endif	/* INNOBASE_FTS0PLUGIN_H */
diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h
new file mode 100644
index 00000000..ae0bb036
--- /dev/null
+++ b/storage/innobase/include/fts0priv.h
@@ -0,0 +1,485 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0priv.h
+Full text search internal header file
+
+Created 2011/09/02 Sunny Bains
+***********************************************************************/
+
+#ifndef INNOBASE_FTS0PRIV_H
+#define INNOBASE_FTS0PRIV_H
+
+#include "dict0dict.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "que0types.h"
+#include "fts0types.h"
+
+/* The various states of the FTS sub system pertaining to a table with
+FTS indexes defined on it. */
+enum fts_table_state_enum {
+					/** This must be 0 since we insert
+					a hard coded '0' at create time
+					to the config table */
+
+	FTS_TABLE_STATE_RUNNING = 0,	/*!< Auxiliary tables created OK */
+
+	FTS_TABLE_STATE_OPTIMIZING,	/*!< This is a substate of RUNNING */
+
+	FTS_TABLE_STATE_DELETED		/*!< All aux tables to be dropped when
+					it's safe to do so */
+};
+
+typedef enum fts_table_state_enum fts_table_state_t;
+
+/** The default time to wait for the background thread (in microseconds). */
+#define FTS_MAX_BACKGROUND_THREAD_WAIT		10000
+
+/** Maximum number of iterations to wait before we complain */
+#define FTS_BACKGROUND_THREAD_WAIT_COUNT	1000
+
+/** The maximum length of a config table parameter name in bytes */
+#define FTS_MAX_CONFIG_NAME_LEN			64
+
+/** The maximum length of the config table's value column in bytes */
+#define FTS_MAX_CONFIG_VALUE_LEN		1024
+
+/** Approx. upper limit of ilist length in bytes. */
+#define FTS_ILIST_MAX_SIZE			(64 * 1024)
+
+/** FTS config table name parameters */
+
+/** The number of seconds after which an OPTIMIZE run will stop */
+#define FTS_OPTIMIZE_LIMIT_IN_SECS	"optimize_checkpoint_limit"
+
+/** The next doc id */
+#define FTS_SYNCED_DOC_ID		"synced_doc_id"
+
+/** The last word that was OPTIMIZED */
+#define FTS_LAST_OPTIMIZED_WORD		"last_optimized_word"
+
+/** Total number of documents that have been deleted. The next_doc_id
+minus this count gives us the total number of documents. */
+#define FTS_TOTAL_DELETED_COUNT		"deleted_doc_count"
+
+/** Total number of words parsed from all documents */
+#define FTS_TOTAL_WORD_COUNT		"total_word_count"
+
+/** Start of optimize of an FTS index */
+#define FTS_OPTIMIZE_START_TIME		"optimize_start_time"
+
+/** End of optimize for an FTS index */
+#define FTS_OPTIMIZE_END_TIME		"optimize_end_time"
+
+/** User specified stopword table name */
+#define	FTS_STOPWORD_TABLE_NAME		"stopword_table_name"
+
+/** Whether to use (turn on/off) stopword */
+#define	FTS_USE_STOPWORD		"use_stopword"
+
+/** State of the FTS system for this table. It can be one of
+ RUNNING, OPTIMIZING, DELETED. */
+#define FTS_TABLE_STATE			"table_state"
+
+/** The minimum length of an FTS auxiliary table name's id component
+e.g., For an auxiliary table name
+
+	FTS_<TABLE_ID>_SUFFIX
+
+This constant is for the minimum length required to store the <TABLE_ID>
+component.
+*/
+#define FTS_AUX_MIN_TABLE_ID_LENGTH	48
+
+/** Maximum length of an integer stored in the config table value column. */
+#define FTS_MAX_INT_LEN			32
+
+/******************************************************************//**
+Parse an SQL string. %s is replaced with the table's id.
+@return query graph */
+que_t*
+fts_parse_sql(
+/*==========*/
+	fts_table_t*	fts_table,	/*!< in: FTS aux table */
+	pars_info_t*	info,		/*!< in: info struct, or NULL */
+	const char*	sql)		/*!< in: SQL string to evaluate */
+	MY_ATTRIBUTE((nonnull(3), malloc, warn_unused_result));
+/******************************************************************//**
+Evaluate a parsed SQL statement
+@return DB_SUCCESS or error code */
+dberr_t
+fts_eval_sql(
+/*=========*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t*		graph)		/*!< in: Parsed statement */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Construct the name of an internal FTS table for the given table.
+@param[in]	fts_table	metadata on fulltext-indexed table
+@param[out]	table_name	a name up to MAX_FULL_NAME_LEN
+@param[in]	dict_locked	whether dict_sys.latch is being held */
+void fts_get_table_name(const fts_table_t* fts_table, char* table_name,
+			bool dict_locked = false)
+	MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Construct the column specification part of the SQL string for selecting the
+indexed FTS columns for the given table. Adds the necessary bound
+ids to the given 'info' and returns the SQL string. Examples:
+
+One indexed column named "text":
+
+ "$sel0",
+ info/ids: sel0 -> "text"
+
+Two indexed columns named "subject" and "content":
+
+ "$sel0, $sel1",
+ info/ids: sel0 -> "subject", sel1 -> "content",
+@return heap-allocated column list string */
+const char*
+fts_get_select_columns_str(
+/*=======================*/
+	dict_index_t*	index,		/*!< in: FTS index */
+	pars_info_t*	info,		/*!< in/out: parser info */
+	mem_heap_t*	heap)		/*!< in: memory heap */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** define for fts_doc_fetch_by_doc_id() "option" value, defines whether
+we want to get Doc whose ID is equal to or greater or smaller than supplied
+ID */
+#define	FTS_FETCH_DOC_BY_ID_EQUAL	1
+#define	FTS_FETCH_DOC_BY_ID_LARGE	2
+#define	FTS_FETCH_DOC_BY_ID_SMALL	3
+
+/*************************************************************//**
+Fetch document (= a single row's indexed text) with the given
+document id.
+@return DB_SUCCESS if fetch is successful, else error */
+dberr_t
+fts_doc_fetch_by_doc_id(
+/*====================*/
+	fts_get_doc_t*	get_doc,	/*!< in: state */
+	doc_id_t	doc_id,		/*!< in: id of document to fetch */
+	dict_index_t*	index_to_use,	/*!< in: caller supplied FTS index,
+					or NULL */
+	ulint		option,         /*!< in: search option, one of
+                                        FTS_FETCH_DOC_BY_ID_* */
+	fts_sql_callback
+			callback,	/*!< in: callback to read
+					records */
+	void*		arg)		/*!< in: callback arg */
+	MY_ATTRIBUTE((nonnull(6)));
+
+/*******************************************************************//**
+Callback function for fetch that stores the text of an FTS document,
+converting each column to UTF-16.
+@return always FALSE */
+ibool
+fts_query_expansion_fetch_doc(
+/*==========================*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg)	/*!< in: fts_doc_t* */
+	MY_ATTRIBUTE((nonnull));
+/********************************************************************
+Write out a single word's data as new entry/entries in the INDEX table.
+@return DB_SUCCESS if all OK. */
+dberr_t
+fts_write_node(
+/*===========*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t**		graph,		/*!< in: query graph */
+	fts_table_t*	fts_table,	/*!< in: the FTS aux index */
+	fts_string_t*	word,		/*!< in: word in UTF-8 */
+	fts_node_t*	node)		/*!< in: node columns */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Check if a fts token is a stopword or less than fts_min_token_size
+or greater than fts_max_token_size.
+@param[in]	token		token string
+@param[in]	stopwords	stopwords rb tree
+@param[in]	cs		token charset
+@retval true	if it is not stopword and length in range
+@retval false	if it is stopword or length not in range */
+bool
+fts_check_token(
+	const fts_string_t*	token,
+	const ib_rbt_t*		stopwords,
+	const CHARSET_INFO*	cs);
+
+/******************************************************************//**
+Initialize a document. */
+void
+fts_doc_init(
+/*=========*/
+	fts_doc_t*	doc)		/*!< in: doc to initialize */
+	MY_ATTRIBUTE((nonnull));
+
+/******************************************************************//**
+Do a binary search for a doc id in the array
+@return +ve index if found, -ve index where it should be
+        inserted if not found */
+int
+fts_bsearch(
+/*========*/
+	doc_id_t*	array,		/*!< in: array to search */
+	int		lower,		/*!< in: lower bound of array*/
+	int		upper,		/*!< in: upper bound of array*/
+	doc_id_t	doc_id)		/*!< in: doc id to lookup */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Free document. */
+void
+fts_doc_free(
+/*=========*/
+	fts_doc_t*	doc)		/*!< in: document */
+	MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Free an fts_word_t instance.*/
+void
+fts_word_free(
+/*==========*/
+	fts_word_t*	word)		/*!< in: instance to free.*/
+	MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Read the rows for the given word from the FTS index
+@return DB_SUCCESS or error code */
+dberr_t
+fts_index_fetch_nodes(
+/*==================*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t**		graph,		/*!< in: prepared statement */
+	fts_table_t*	fts_table,	/*!< in: FTS aux table */
+	const fts_string_t*
+			word,		/*!< in: the word to fetch */
+	fts_fetch_t*	fetch)		/*!< in: fetch callback.*/
+	MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Compare two fts_trx_table_t instances, we actually compare the
+table id's here.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_cmp(
+/*==============*/
+	const void*	v1,		/*!< in: id1 */
+	const void*	v2)		/*!< in: id2 */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Compare a table id with a trx_table_t table id.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_id_cmp(
+/*=================*/
+	const void*	p1,		/*!< in: id1 */
+	const void*	p2)		/*!< in: id2 */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+#define fts_sql_commit(trx) trx_commit_for_mysql(trx)
+#define fts_sql_rollback(trx) (trx)->rollback()
+/******************************************************************//**
+Get value from config table. The caller must ensure that enough
+space is allocated for value to hold the column contents
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_value(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: get config value for
+					this parameter name */
+	fts_string_t*	value)		/*!< out: value read from
+					config table */
+	MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Get value specific to an FTS index from the config table. The caller
+must ensure that enough space is allocated for value to hold the
+column contents.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_index_value(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: index */
+	const char*	param,		/*!< in: get config value for
+					this parameter name */
+	fts_string_t*	value)		/*!< out: value read from
+					config table */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Set the value in the config table for name.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_set_value(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: set config value for
+					this parameter name */
+	const fts_string_t*
+			value)		/*!< in: value to update */
+	MY_ATTRIBUTE((nonnull));
+/****************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+fts_config_set_ulint(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: param name */
+	ulint		int_value)	/*!< in: value */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Set the value specific to an FTS index in the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_set_index_value(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: index */
+	const char*	param,		/*!< in: set config value for
+					this parameter name */
+	fts_string_t*	value)		/*!< in: value to write to the
+					config table */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#ifdef FTS_OPTIMIZE_DEBUG
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_index_ulint(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: FTS index */
+	const char*	name,		/*!< in: param name */
+	ulint*		int_value)	/*!< out: value */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+#endif /* FTS_OPTIMIZE_DEBUG */
+
+/******************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_set_index_ulint(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: FTS index */
+	const char*	name,		/*!< in: param name */
+	ulint		int_value)	/*!< in: value */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_ulint(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: param name */
+	ulint*		int_value)	/*!< out: value */
+	MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Search cache for word.
+@return the word node vector if found else NULL */
+const ib_vector_t*
+fts_cache_find_word(
+/*================*/
+	const fts_index_cache_t*
+			index_cache,	/*!< in: cache to search */
+	const fts_string_t*
+			text)		/*!< in: word to search for */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/******************************************************************//**
+Append deleted doc ids to vector and sort the vector. */
+void
+fts_cache_append_deleted_doc_ids(
+/*=============================*/
+	fts_cache_t*	cache,		/*!< in: cache to use */
+	ib_vector_t*	vector);	/*!< in: append to this vector */
+/******************************************************************//**
+Search the index specific cache for a particular FTS index.
+@return the index specific cache else NULL */
+fts_index_cache_t*
+fts_find_index_cache(
+/*================*/
+	const fts_cache_t*
+			cache,		/*!< in: cache to search */
+	const dict_index_t*
+			index)		/*!< in: index to search for */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Write the table id to the given buffer (including final NUL). Buffer must be
+at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long.
+@return number of characters written, not counting the final NUL */
+UNIV_INLINE
+int
+fts_write_object_id(
+/*================*/
+	ib_id_t		id,		/*!< in: a table/index id */
+	char*		str);		/*!< out: buffer to write the id to */
+/******************************************************************//**
+Read the table id from the string generated by fts_write_object_id().
+@return TRUE if parse successful */
+UNIV_INLINE
+ibool
+fts_read_object_id(
+/*===============*/
+	ib_id_t*	id,		/*!< out: a table id */
+	const char*	str)		/*!< in: buffer to read from */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Get the table id.
+@return number of bytes written */
+int
+fts_get_table_id(
+/*=============*/
+	const fts_table_t*
+			fts_table,	/*!< in: FTS Auxiliary table */
+	char*		table_id)	/*!< out: table id, must be at least
+					FTS_AUX_MIN_TABLE_ID_LENGTH bytes
+					long */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Add node positions. */
+void
+fts_cache_node_add_positions(
+/*=========================*/
+	fts_cache_t*	cache,		/*!< in: cache */
+	fts_node_t*	node,		/*!< in: word node */
+	doc_id_t	doc_id,		/*!< in: doc id */
+	ib_vector_t*	positions)	/*!< in: fts_token_t::positions */
+	MY_ATTRIBUTE((nonnull(2,4)));
+
+/******************************************************************//**
+Create the config table name for retrieving index specific value.
+@return index config parameter name */
+char*
+fts_config_create_index_param_name(
+/*===============================*/
+	const char*		param,	/*!< in: base name of param */
+	const dict_index_t*	index)	/*!< in: index for config */
+	MY_ATTRIBUTE((nonnull, malloc, warn_unused_result));
+
+#include "fts0priv.inl"
+
+#endif /* INNOBASE_FTS0PRIV_H */
diff --git a/storage/innobase/include/fts0priv.inl b/storage/innobase/include/fts0priv.inl
new file mode 100644
index 00000000..da14cfcb
--- /dev/null
+++ b/storage/innobase/include/fts0priv.inl
@@ -0,0 +1,121 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0priv.inl
+Full text search internal header file
+
+Created 2011/11/12 Sunny Bains
+***********************************************************************/
+
+/******************************************************************//**
+Write the table id to the given buffer (including final NUL). Buffer must be
+at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long.
+@return number of characters written, not counting the final NUL */
+UNIV_INLINE
+int
+fts_write_object_id(
+/*================*/
+	ib_id_t		id,		/*!< in: a table/index id */
+	char*		str)		/*!< out: buffer to write the id to */
+{
+	/* Each DBUG_EXECUTE_IF below returns early for its debug keyword. */
+#ifdef _WIN32
+
+	DBUG_EXECUTE_IF("innodb_test_wrong_non_windows_fts_aux_table_name",
+			return(sprintf(str, UINT64PFx, id)););
+
+	/* Use this to construct old(5.6.14 and 5.7.3) windows
+	ambiguous aux table names */
+	DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+			return(sprintf(str, "%016llu", (ulonglong) id)););
+
+#else /* _WIN32 */
+
+	/* Use this to construct old(5.6.14 and 5.7.3) windows
+	ambiguous aux table names */
+	DBUG_EXECUTE_IF("innodb_test_wrong_windows_fts_aux_table_name",
+			return(sprintf(str, "%016llu", (ulonglong) id)););
+	/* NOTE(review): same format string as the final return below. */
+	DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+			return(sprintf(str, "%016llx", (ulonglong) id)););
+
+#endif /* _WIN32 */
+	/* Normal path: 16 zero-padded lowercase hexadecimal digits. */
+	return(sprintf(str, "%016llx", (ulonglong) id));
+}
+
+/******************************************************************//**
+Parse an object id out of a string produced by fts_write_object_id().
+@return TRUE if exactly one id was converted, FALSE otherwise */
+UNIV_INLINE
+ibool
+fts_read_object_id(
+/*===============*/
+	ib_id_t*	id,		/*!< out: the parsed id */
+	const char*	str)		/*!< in: buffer to read from */
+{
+	/* HEX_NAME is deliberately ignored here: whoever consumes the id
+	decides whether it is HEX or DEC and handles it accordingly. */
+	const int	n_converted = sscanf(str, UINT64PFx, id);
+	return(n_converted == 1);
+}
+
+/******************************************************************//**
+Order two fts_trx_table_t entries by the id of the table each refers to.
+Both arguments point to a pointer to fts_trx_table_t.
+@return -1 if id1 < id2, 0 if id1 == id2, 1 if id1 > id2 */
+UNIV_INLINE
+int
+fts_trx_table_cmp(
+/*==============*/
+	const void*	p1,			/*!< in: first entry */
+	const void*	p2)			/*!< in: second entry */
+{
+	const fts_trx_table_t*	lhs
+		= *static_cast<const fts_trx_table_t* const*>(p1);
+	const fts_trx_table_t*	rhs
+		= *static_cast<const fts_trx_table_t* const*>(p2);
+
+	if (lhs->table->id == rhs->table->id) {
+		return(0);
+	}
+
+	return(lhs->table->id > rhs->table->id ? 1 : -1);
+}
+
+/******************************************************************//**
+Compare a bare table id against the id of the table in a fts_trx_table_t.
+@return -1 if id1 < id2, 0 if id1 == id2, 1 if id1 > id2 */
+UNIV_INLINE
+int
+fts_trx_table_id_cmp(
+/*=================*/
+	const void*	p1,			/*!< in: pointer to table id */
+	const void*	p2)			/*!< in: fts_trx_table_t** */
+{
+	const uintmax_t		wanted = *static_cast<const uintmax_t*>(p1);
+	const fts_trx_table_t*	entry
+		= *static_cast<const fts_trx_table_t* const*>(p2);
+
+	if (wanted == entry->table->id) {
+		return(0);
+	}
+
+	return(wanted > entry->table->id ? 1 : -1);
+}
diff --git a/storage/innobase/include/fts0tlex.h b/storage/innobase/include/fts0tlex.h
new file mode 100644
index 00000000..89655ca1
--- /dev/null
+++ b/storage/innobase/include/fts0tlex.h
@@ -0,0 +1,702 @@
+#ifndef fts0tHEADER_H
+#define fts0tHEADER_H 1
+#define fts0tIN_HEADER 1
+
+#line 6 "../include/fts0tlex.h"
+
+#line 8 "../include/fts0tlex.h"
+
+#define  YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 6
+#define YY_FLEX_SUBMINOR_VERSION 4
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+#ifdef yy_create_buffer
+#define fts0t_create_buffer_ALREADY_DEFINED
+#else
+#define yy_create_buffer fts0t_create_buffer
+#endif
+
+#ifdef yy_delete_buffer
+#define fts0t_delete_buffer_ALREADY_DEFINED
+#else
+#define yy_delete_buffer fts0t_delete_buffer
+#endif
+
+#ifdef yy_scan_buffer
+#define fts0t_scan_buffer_ALREADY_DEFINED
+#else
+#define yy_scan_buffer fts0t_scan_buffer
+#endif
+
+#ifdef yy_scan_string
+#define fts0t_scan_string_ALREADY_DEFINED
+#else
+#define yy_scan_string fts0t_scan_string
+#endif
+
+#ifdef yy_scan_bytes
+#define fts0t_scan_bytes_ALREADY_DEFINED
+#else
+#define yy_scan_bytes fts0t_scan_bytes
+#endif
+
+#ifdef yy_init_buffer
+#define fts0t_init_buffer_ALREADY_DEFINED
+#else
+#define yy_init_buffer fts0t_init_buffer
+#endif
+
+#ifdef yy_flush_buffer
+#define fts0t_flush_buffer_ALREADY_DEFINED
+#else
+#define yy_flush_buffer fts0t_flush_buffer
+#endif
+
+#ifdef yy_load_buffer_state
+#define fts0t_load_buffer_state_ALREADY_DEFINED
+#else
+#define yy_load_buffer_state fts0t_load_buffer_state
+#endif
+
+#ifdef yy_switch_to_buffer
+#define fts0t_switch_to_buffer_ALREADY_DEFINED
+#else
+#define yy_switch_to_buffer fts0t_switch_to_buffer
+#endif
+
+#ifdef yypush_buffer_state
+#define fts0tpush_buffer_state_ALREADY_DEFINED
+#else
+#define yypush_buffer_state fts0tpush_buffer_state
+#endif
+
+#ifdef yypop_buffer_state
+#define fts0tpop_buffer_state_ALREADY_DEFINED
+#else
+#define yypop_buffer_state fts0tpop_buffer_state
+#endif
+
+#ifdef yyensure_buffer_stack
+#define fts0tensure_buffer_stack_ALREADY_DEFINED
+#else
+#define yyensure_buffer_stack fts0tensure_buffer_stack
+#endif
+
+#ifdef yylex
+#define fts0tlex_ALREADY_DEFINED
+#else
+#define yylex fts0tlex
+#endif
+
+#ifdef yyrestart
+#define fts0trestart_ALREADY_DEFINED
+#else
+#define yyrestart fts0trestart
+#endif
+
+#ifdef yylex_init
+#define fts0tlex_init_ALREADY_DEFINED
+#else
+#define yylex_init fts0tlex_init
+#endif
+
+#ifdef yylex_init_extra
+#define fts0tlex_init_extra_ALREADY_DEFINED
+#else
+#define yylex_init_extra fts0tlex_init_extra
+#endif
+
+#ifdef yylex_destroy
+#define fts0tlex_destroy_ALREADY_DEFINED
+#else
+#define yylex_destroy fts0tlex_destroy
+#endif
+
+#ifdef yyget_debug
+#define fts0tget_debug_ALREADY_DEFINED
+#else
+#define yyget_debug fts0tget_debug
+#endif
+
+#ifdef yyset_debug
+#define fts0tset_debug_ALREADY_DEFINED
+#else
+#define yyset_debug fts0tset_debug
+#endif
+
+#ifdef yyget_extra
+#define fts0tget_extra_ALREADY_DEFINED
+#else
+#define yyget_extra fts0tget_extra
+#endif
+
+#ifdef yyset_extra
+#define fts0tset_extra_ALREADY_DEFINED
+#else
+#define yyset_extra fts0tset_extra
+#endif
+
+#ifdef yyget_in
+#define fts0tget_in_ALREADY_DEFINED
+#else
+#define yyget_in fts0tget_in
+#endif
+
+#ifdef yyset_in
+#define fts0tset_in_ALREADY_DEFINED
+#else
+#define yyset_in fts0tset_in
+#endif
+
+#ifdef yyget_out
+#define fts0tget_out_ALREADY_DEFINED
+#else
+#define yyget_out fts0tget_out
+#endif
+
+#ifdef yyset_out
+#define fts0tset_out_ALREADY_DEFINED
+#else
+#define yyset_out fts0tset_out
+#endif
+
+#ifdef yyget_leng
+#define fts0tget_leng_ALREADY_DEFINED
+#else
+#define yyget_leng fts0tget_leng
+#endif
+
+#ifdef yyget_text
+#define fts0tget_text_ALREADY_DEFINED
+#else
+#define yyget_text fts0tget_text
+#endif
+
+#ifdef yyget_lineno
+#define fts0tget_lineno_ALREADY_DEFINED
+#else
+#define yyget_lineno fts0tget_lineno
+#endif
+
+#ifdef yyset_lineno
+#define fts0tset_lineno_ALREADY_DEFINED
+#else
+#define yyset_lineno fts0tset_lineno
+#endif
+
+#ifdef yyget_column
+#define fts0tget_column_ALREADY_DEFINED
+#else
+#define yyget_column fts0tget_column
+#endif
+
+#ifdef yyset_column
+#define fts0tset_column_ALREADY_DEFINED
+#else
+#define yyset_column fts0tset_column
+#endif
+
+#ifdef yywrap
+#define fts0twrap_ALREADY_DEFINED
+#else
+#define yywrap fts0twrap
+#endif
+
+#ifdef yyalloc
+#define fts0talloc_ALREADY_DEFINED
+#else
+#define yyalloc fts0talloc
+#endif
+
+#ifdef yyrealloc
+#define fts0trealloc_ALREADY_DEFINED
+#else
+#define yyrealloc fts0trealloc
+#endif
+
+#ifdef yyfree
+#define fts0tfree_ALREADY_DEFINED
+#else
+#define yyfree fts0tfree
+#endif
+
+/* First, we deal with  platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types. 
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t; 
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN               (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN              (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN              (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX               (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX              (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX              (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX              (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX             (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX             (4294967295U)
+#endif
+
+#ifndef SIZE_MAX
+#define SIZE_MAX               (~(size_t)0)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+/* begin standard C++ headers. */
+
+/* TODO: this is always defined, so inline it */
+#define yyconst const
+
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define yynoreturn __attribute__((__noreturn__))
+#else
+#define yynoreturn
+#endif
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+   are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+	{
+	FILE *yy_input_file;
+
+	char *yy_ch_buf;		/* input buffer */
+	char *yy_buf_pos;		/* current position in input buffer */
+
+	/* Size of input buffer in bytes, not including room for EOB
+	 * characters.
+	 */
+	int yy_buf_size;
+
+	/* Number of characters read into yy_ch_buf, not including EOB
+	 * characters.
+	 */
+	int yy_n_chars;
+
+	/* Whether we "own" the buffer - i.e., we know we created it,
+	 * and can realloc() it to grow it, and should free() it to
+	 * delete it.
+	 */
+	int yy_is_our_buffer;
+
+	/* Whether this is an "interactive" input source; if so, and
+	 * if we're using stdio for input, then we want to use getc()
+	 * instead of fread(), to make sure we stop fetching input after
+	 * each newline.
+	 */
+	int yy_is_interactive;
+
+	/* Whether we're considered to be at the beginning of a line.
+	 * If so, '^' rules will be active on the next match, otherwise
+	 * not.
+	 */
+	int yy_at_bol;
+
+    int yy_bs_lineno; /**< The line count. */
+    int yy_bs_column; /**< The column count. */
+
+	/* Whether to try to fill the input buffer when we reach the
+	 * end of it.
+	 */
+	int yy_fill_buffer;
+
+	int yy_buffer_status;
+
+	};
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+void yyrestart ( FILE *input_file , yyscan_t yyscanner );
+void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size , yyscan_t yyscanner );
+void yy_delete_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner );
+void yy_flush_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner );
+void yypush_buffer_state ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner );
+void yypop_buffer_state ( yyscan_t yyscanner );
+
+YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_scan_string ( const char *yy_str , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len , yyscan_t yyscanner );
+
+void *yyalloc ( yy_size_t , yyscan_t yyscanner );
+void *yyrealloc ( void *, yy_size_t , yyscan_t yyscanner );
+void yyfree ( void * , yyscan_t yyscanner );
+
+/* Begin user sect3 */
+
+#define fts0twrap(yyscanner) (/*CONSTCOND*/1)
+#define YY_SKIP_YYWRAP
+
+#define yytext_ptr yytext_r
+
+#ifdef YY_HEADER_EXPORT_START_CONDITIONS
+#define INITIAL 0
+
+#endif
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+int yylex_init (yyscan_t* scanner);
+
+int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner);
+
+/* Accessor methods to globals.
+   These are made visible to non-reentrant scanners for convenience. */
+
+int yylex_destroy ( yyscan_t yyscanner );
+
+int yyget_debug ( yyscan_t yyscanner );
+
+void yyset_debug ( int debug_flag , yyscan_t yyscanner );
+
+YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner );
+
+void yyset_extra ( YY_EXTRA_TYPE user_defined , yyscan_t yyscanner );
+
+FILE *yyget_in ( yyscan_t yyscanner );
+
+void yyset_in  ( FILE * _in_str , yyscan_t yyscanner );
+
+FILE *yyget_out ( yyscan_t yyscanner );
+
+void yyset_out  ( FILE * _out_str , yyscan_t yyscanner );
+
+			int yyget_leng ( yyscan_t yyscanner );
+
+char *yyget_text ( yyscan_t yyscanner );
+
+int yyget_lineno ( yyscan_t yyscanner );
+
+void yyset_lineno ( int _line_number , yyscan_t yyscanner );
+
+int yyget_column  ( yyscan_t yyscanner );
+
+void yyset_column ( int _column_no , yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int yywrap ( yyscan_t yyscanner );
+#else
+extern int yywrap ( yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy ( char *, const char *, int , yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen ( const char * , yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int yylex (yyscan_t yyscanner);
+
+#define YY_DECL int yylex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+
+#ifndef fts0t_create_buffer_ALREADY_DEFINED
+#undef yy_create_buffer
+#endif
+#ifndef fts0t_delete_buffer_ALREADY_DEFINED
+#undef yy_delete_buffer
+#endif
+#ifndef fts0t_scan_buffer_ALREADY_DEFINED
+#undef yy_scan_buffer
+#endif
+#ifndef fts0t_scan_string_ALREADY_DEFINED
+#undef yy_scan_string
+#endif
+#ifndef fts0t_scan_bytes_ALREADY_DEFINED
+#undef yy_scan_bytes
+#endif
+#ifndef fts0t_init_buffer_ALREADY_DEFINED
+#undef yy_init_buffer
+#endif
+#ifndef fts0t_flush_buffer_ALREADY_DEFINED
+#undef yy_flush_buffer
+#endif
+#ifndef fts0t_load_buffer_state_ALREADY_DEFINED
+#undef yy_load_buffer_state
+#endif
+#ifndef fts0t_switch_to_buffer_ALREADY_DEFINED
+#undef yy_switch_to_buffer
+#endif
+#ifndef fts0tpush_buffer_state_ALREADY_DEFINED
+#undef yypush_buffer_state
+#endif
+#ifndef fts0tpop_buffer_state_ALREADY_DEFINED
+#undef yypop_buffer_state
+#endif
+#ifndef fts0tensure_buffer_stack_ALREADY_DEFINED
+#undef yyensure_buffer_stack
+#endif
+#ifndef fts0tlex_ALREADY_DEFINED
+#undef yylex
+#endif
+#ifndef fts0trestart_ALREADY_DEFINED
+#undef yyrestart
+#endif
+#ifndef fts0tlex_init_ALREADY_DEFINED
+#undef yylex_init
+#endif
+#ifndef fts0tlex_init_extra_ALREADY_DEFINED
+#undef yylex_init_extra
+#endif
+#ifndef fts0tlex_destroy_ALREADY_DEFINED
+#undef yylex_destroy
+#endif
+#ifndef fts0tget_debug_ALREADY_DEFINED
+#undef yyget_debug
+#endif
+#ifndef fts0tset_debug_ALREADY_DEFINED
+#undef yyset_debug
+#endif
+#ifndef fts0tget_extra_ALREADY_DEFINED
+#undef yyget_extra
+#endif
+#ifndef fts0tset_extra_ALREADY_DEFINED
+#undef yyset_extra
+#endif
+#ifndef fts0tget_in_ALREADY_DEFINED
+#undef yyget_in
+#endif
+#ifndef fts0tset_in_ALREADY_DEFINED
+#undef yyset_in
+#endif
+#ifndef fts0tget_out_ALREADY_DEFINED
+#undef yyget_out
+#endif
+#ifndef fts0tset_out_ALREADY_DEFINED
+#undef yyset_out
+#endif
+#ifndef fts0tget_leng_ALREADY_DEFINED
+#undef yyget_leng
+#endif
+#ifndef fts0tget_text_ALREADY_DEFINED
+#undef yyget_text
+#endif
+#ifndef fts0tget_lineno_ALREADY_DEFINED
+#undef yyget_lineno
+#endif
+#ifndef fts0tset_lineno_ALREADY_DEFINED
+#undef yyset_lineno
+#endif
+#ifndef fts0tget_column_ALREADY_DEFINED
+#undef yyget_column
+#endif
+#ifndef fts0tset_column_ALREADY_DEFINED
+#undef yyset_column
+#endif
+#ifndef fts0twrap_ALREADY_DEFINED
+#undef yywrap
+#endif
+#ifndef fts0tget_lval_ALREADY_DEFINED
+#undef yyget_lval
+#endif
+#ifndef fts0tset_lval_ALREADY_DEFINED
+#undef yyset_lval
+#endif
+#ifndef fts0tget_lloc_ALREADY_DEFINED
+#undef yyget_lloc
+#endif
+#ifndef fts0tset_lloc_ALREADY_DEFINED
+#undef yyset_lloc
+#endif
+#ifndef fts0talloc_ALREADY_DEFINED
+#undef yyalloc
+#endif
+#ifndef fts0trealloc_ALREADY_DEFINED
+#undef yyrealloc
+#endif
+#ifndef fts0tfree_ALREADY_DEFINED
+#undef yyfree
+#endif
+#ifndef fts0ttext_ALREADY_DEFINED
+#undef yytext
+#endif
+#ifndef fts0tleng_ALREADY_DEFINED
+#undef yyleng
+#endif
+#ifndef fts0tin_ALREADY_DEFINED
+#undef yyin
+#endif
+#ifndef fts0tout_ALREADY_DEFINED
+#undef yyout
+#endif
+#ifndef fts0t_flex_debug_ALREADY_DEFINED
+#undef yy_flex_debug
+#endif
+#ifndef fts0tlineno_ALREADY_DEFINED
+#undef yylineno
+#endif
+#ifndef fts0ttables_fload_ALREADY_DEFINED
+#undef yytables_fload
+#endif
+#ifndef fts0ttables_destroy_ALREADY_DEFINED
+#undef yytables_destroy
+#endif
+#ifndef fts0tTABLES_NAME_ALREADY_DEFINED
+#undef yyTABLES_NAME
+#endif
+
+#line 69 "fts0tlex.l"
+
+
+#line 701 "../include/fts0tlex.h"
+#undef fts0tIN_HEADER
+#endif /* fts0tHEADER_H */
diff --git a/storage/innobase/include/fts0tokenize.h b/storage/innobase/include/fts0tokenize.h
new file mode 100644
index 00000000..1cddaf5b
--- /dev/null
+++ b/storage/innobase/include/fts0tokenize.h
@@ -0,0 +1,189 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0tokenize.h
+Full Text Search plugin tokenizer refer to MyISAM
+
+Created 2014/11/17 Shaohua Wang
+***********************************************************************/
+
+#include "ft_global.h"
+#include "mysql/plugin_ftparser.h"
+#include "m_ctype.h"
+
+/* Macros and structs below are from ftdefs.h in MyISAM */
+/** Check if a char is a true word character */
+#define true_word_char(c, ch) ((c) & (_MY_U | _MY_L | _MY_NMR) || (ch) == '_')
+
+/** Check if a char is a misc word character (always 0 here) */
+#define misc_word_char(X)       0
+
+/** Boolean search syntax */
+static const char* fts_boolean_syntax = DEFAULT_FTB_SYNTAX;
+
+#define FTB_YES   (fts_boolean_syntax[0])
+#define FTB_EGAL  (fts_boolean_syntax[1])
+#define FTB_NO    (fts_boolean_syntax[2])
+#define FTB_INC   (fts_boolean_syntax[3])
+#define FTB_DEC   (fts_boolean_syntax[4])
+#define FTB_LBR   (fts_boolean_syntax[5])
+#define FTB_RBR   (fts_boolean_syntax[6])
+#define FTB_NEG   (fts_boolean_syntax[7])
+#define FTB_TRUNC (fts_boolean_syntax[8])
+#define FTB_LQUOT (fts_boolean_syntax[10])
+#define FTB_RQUOT (fts_boolean_syntax[11])
+
+/** FTS query token, filled in by fts_get_word() */
+typedef struct st_ft_word {
+        uchar* pos;     /*!< pointer to the first byte of the word */
+        uint   len;     /*!< word length in bytes */
+        double weight;  /*!< word weight, unused in innodb */
+} FT_WORD;
+
+/** Tokenizer for ngram referring to ft_get_word(ft_parser.c) in MyISAM.
+Differences: a. code format changed; b. stopword processing removed.
+@param[in]	cs	charset
+@param[in,out]	start	doc start pointer; moved past the token, unchanged on eof
+@param[in]	end	doc end pointer
+@param[out]	word	token; pos and len are set when a word is found
+@param[in,out]	info	token info
+@retval	0	eof
+@retval	1	word found
+@retval	2	left bracket
+@retval	3	right bracket
+@retval	4	stopword found (not returned here; stopwords are not checked) */
+inline
+uchar
+fts_get_word(
+	const CHARSET_INFO*	cs,
+	uchar**			start,
+	uchar*			end,
+	FT_WORD*		word,
+	MYSQL_FTPARSER_BOOLEAN_INFO*
+				info)
+{
+	uchar*	doc = *start;
+	int	ctype;
+	uint	mwc;
+	uint	length;
+	int	mbl;
+
+	info->yesno = (FTB_YES ==' ') ? 1 : (info->quot != 0);
+	info->weight_adjust = info->wasign = 0;
+	info->type = FT_TOKEN_EOF;
+	/* At most one pass: skip non-word chars, then scan one word. */
+	while (doc < end) {
+		for (; doc < end;
+		     doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
+			mbl = cs->ctype(&ctype, doc, end);
+
+			if (true_word_char(ctype, *doc)) {
+				break;
+			}
+			/* Inside quotes, a closing quote ends the phrase. */
+			if (*doc == FTB_RQUOT && info->quot) {
+				*start = doc + 1;
+				info->type = FT_TOKEN_RIGHT_PAREN;
+
+				return(info->type);
+			}
+			/* Brackets and operators only count outside quotes. */
+			if (!info->quot) {
+				if (*doc == FTB_LBR
+				    || *doc == FTB_RBR
+				    || *doc == FTB_LQUOT) {
+					/* param->prev=' '; */
+					*start = doc + 1;
+					if (*doc == FTB_LQUOT) {
+						info->quot = (char*)1;
+					}
+
+					info->type = (*doc == FTB_RBR ?
+						       FT_TOKEN_RIGHT_PAREN :
+						       FT_TOKEN_LEFT_PAREN);
+
+					return(info->type);
+				}
+				/* Operators apply only after a space. */
+				if (info->prev == ' ') {
+					if (*doc == FTB_YES) {
+						info->yesno = +1;
+						continue;
+					} else if (*doc == FTB_EGAL) {
+						info->yesno = 0;
+						continue;
+					} else if (*doc == FTB_NO) {
+						info->yesno = -1;
+						continue;
+					} else if (*doc == FTB_INC) {
+						info->weight_adjust++;
+						continue;
+					} else if (*doc == FTB_DEC) {
+						info->weight_adjust--;
+						continue;
+					} else if (*doc == FTB_NEG) {
+						info->wasign = !info->wasign;
+						continue;
+					}
+				}
+			}
+			/* Any other char: remember it, reset operator state. */
+			info->prev = char(*doc);
+			info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0);
+			info->weight_adjust = info->wasign = 0;
+		}
+		/* Scan the word itself, starting at the current position. */
+		mwc = length = 0;
+		for (word->pos = doc;
+		     doc < end;
+		     length++, doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
+			mbl = cs->ctype(&ctype, doc, end);
+
+			if (true_word_char(ctype, *doc)) {
+				mwc = 0;
+			} else if (!misc_word_char(*doc) || mwc) {
+				break;
+			} else {
+				mwc++;
+			}
+		}
+
+		/* Be sure *prev is true_word_char. */
+		info->prev = 'A';
+		word->len = (uint)(doc-word->pos) - mwc;
+		/* A truncation operator right after the word is consumed. */
+		if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) {
+			doc++;
+		}
+
+		/* We don't check stopword here. */
+		*start = doc;
+		info->type = FT_TOKEN_WORD;
+
+		return(info->type);
+	}
+	/* No input left; an open quote is closed with a right bracket. */
+	if (info->quot) {
+		*start = doc;
+		info->type = FT_TOKEN_RIGHT_PAREN;
+	}
+
+	return(info->type);
+}
diff --git a/storage/innobase/include/fts0types.h b/storage/innobase/include/fts0types.h
new file mode 100644
index 00000000..fb278d54
--- /dev/null
+++ b/storage/innobase/include/fts0types.h
@@ -0,0 +1,354 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.h
+Full text search types file
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_H
+#define INNOBASE_FTS0TYPES_H
+
+#include "fts0fts.h"
+#include "pars0pars.h"
+#include "que0types.h"
+#include "ut0byte.h"
+#include "ut0rbt.h"
+
+/** Forward declarations of types used within FTS. */
+struct fts_que_t;
+struct fts_node_t;
+
+/** Callback types used within FTS. */
+typedef pars_user_func_cb_t fts_sql_callback;
+typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len);
+
+/** Statistics about one particular document, used during retrieval. */
+struct fts_doc_stats_t {
+	doc_id_t	doc_id;		/*!< Document id */
+	ulint		word_count;	/*!< Total number of words in the document */
+};
+
+/** Its main purpose is to store the SQL prepared statements that
+are required to retrieve a document from the database. */
+struct fts_get_doc_t {
+	fts_index_cache_t*
+			index_cache;	/*!< The index cache instance */
+
+					/** Parsed SQL statement */
+	que_t*		get_document_graph;
+	fts_cache_t*	cache;		/*!< The parent cache */
+};
+
+/** Since we can have multiple FTS indexes on a table, we keep a
+per-index cache of words etc. */
+struct fts_index_cache_t {
+	dict_index_t*	index;		/*!< The FTS index instance */
+
+	ib_rbt_t*	words;		/*!< Nodes; indexed by fts_string_t*,
+					cells are fts_tokenizer_word_t*. */
+
+	ib_vector_t*	doc_stats;	/*!< Array of the fts_doc_stats_t
+					contained in the memory buffer.
+					Must be in sorted order (ascending).
+					The ideal choice is an rb tree, but
+					the rb tree imposes a space overhead
+					that we can do without */
+
+	que_t**		ins_graph;	/*!< Insert query graphs */
+
+	que_t**		sel_graph;	/*!< Select query graphs */
+	CHARSET_INFO*	charset;	/*!< Charset */
+};
+
+/** Stopword control information. */
+struct fts_stopword_t {
+	ulint		status;		/*!< Status of the stopword tree */
+	ib_alloc_t*	heap;		/*!< The memory allocator to use */
+	ib_rbt_t*	cached_stopword;/*!< This stores all active stopwords */
+	CHARSET_INFO*	charset;	/*!< Charset for the stopwords */
+};
+
+/** The SYNC state of the cache. There is one instance of this struct
+associated with each ADD thread. */
+struct fts_sync_t {
+	trx_t*		trx;		/*!< The transaction used for SYNCing
+					the cache to disk */
+	dict_table_t*	table;		/*!< Table with FTS index(es) */
+	ulint		max_cache_size;	/*!< Max size in bytes of the cache */
+	ibool		cache_full;	/*!< flag; when true, it indicates that
+					we need to sync the cache to disk */
+	ulint		lower_index;	/*!< the start index of the doc id
+					vector from where to start adding
+					documents to the FTS cache */
+	ulint		upper_index;	/*!< max index of the doc id vector to
+					add to the FTS cache */
+	ibool		interrupted;	/*!< TRUE if SYNC was interrupted */
+	doc_id_t	min_doc_id;	/*!< The smallest doc id added to the
+					cache. It should be equal to
+					doc_ids[lower_index] */
+	doc_id_t	max_doc_id;	/*!< The doc id at which the cache was
+					noted as being full; used to set the
+					upper limit (presumably upper_index) */
+	time_t		start_time;	/*!< SYNC start time; only used if
+					fts_enable_diag_print */
+	bool		in_progress;	/*!< flag whether sync is in progress. */
+	bool		unlock_cache;	/*!< flag whether to unlock the cache
+					when writing an fts node */
+  /** condition variable for in_progress; used with table->fts->cache->lock */
+  pthread_cond_t cond;
+};
+
+/** The cache for the FTS system. It is a memory-based inverted index
+to which new entries are added until it grows over the configured maximum
+size, at which time its contents are written to the INDEX table. */
+struct fts_cache_t
+{
+  /** lock protecting all access to the memory buffer */
+  mysql_mutex_t lock;
+  /** lock for cache initialization */
+  mysql_mutex_t init_lock;
+
+  /** protection for deleted_doc_ids */
+  mysql_mutex_t deleted_lock;
+
+  /** protection for DOC_ID */
+  mysql_mutex_t	doc_id_lock;
+
+	ib_vector_t*	deleted_doc_ids;/*!< Array of deleted doc ids, each
+					element is of type fts_update_t */
+
+	ib_vector_t*	indexes;	/*!< We store the stats and inverted
+					index for the individual FTS indexes
+					in this vector. Each element is
+					an instance of fts_index_cache_t */
+
+	ib_vector_t*	get_docs;	/*!< information required to read
+					the document from the table. Each
+					element is of type fts_doc_t */
+
+	size_t		total_size;	/*!< total size consumed by the ilist
+					field of all nodes. SYNC is run
+					whenever this gets too big */
+	/** total_size at the time of the previous SYNC request */
+	size_t		total_size_at_sync;
+
+	fts_sync_t*	sync;		/*!< sync structure to sync data to
+					disk */
+	ib_alloc_t*	sync_heap;	/*!< The heap allocator for indexes
+					and deleted_doc_ids, i.e. transient
+					objects; they are recreated after
+					a SYNC is completed */
+
+	ib_alloc_t*	self_heap;	/*!< This heap is the heap out of
+					which an instance of the cache itself
+					was created. Objects created using
+					this heap will last for the lifetime
+					of the cache */
+
+	doc_id_t	next_doc_id;	/*!< Next doc id */
+
+	doc_id_t	synced_doc_id;	/*!< Doc ID sync-ed to CONFIG table */
+
+	doc_id_t	first_doc_id;	/*!< first doc id since this table
+					was opened */
+
+	ulint		deleted;	/*!< Number of doc ids deleted since
+					last optimized. This variable is
+					covered by deleted_lock */
+
+	ulint		added;		/*!< Number of doc ids added since last
+					optimized. This variable is covered by
+					deleted_lock */
+
+	fts_stopword_t	stopword_info;	/*!< Cached stopwords for the FTS */
+	mem_heap_t*	cache_heap;	/*!< Cache heap */
+};
+
+/** Columns of the FTS auxiliary INDEX table */
+struct fts_node_t {
+	doc_id_t	first_doc_id;	/*!< First document id in ilist. */
+
+	doc_id_t	last_doc_id;	/*!< Last document id in ilist. */
+
+	byte*		ilist;		/*!< Binary list of the documents and
+					word positions the token appears in.
+					TODO: For now, these are simply
+					ut_malloc'd, but if testing shows
+					that they waste memory unacceptably, a
+					special memory allocator will have
+					to be written */
+
+	ulint		doc_count;	/*!< Number of doc ids in ilist */
+
+	ulint		ilist_size;	/*!< Used size of ilist in bytes. */
+
+	ulint		ilist_size_alloc;
+					/*!< Allocated size of ilist in
+					bytes */
+	bool		synced;		/*!< flag: whether the node is synced */
+};
+
+/** A tokenizer word. Contains information about one word. */
+struct fts_tokenizer_word_t {
+	fts_string_t	text;		/*!< Token text. */
+
+	ib_vector_t*	nodes;		/*!< Word node ilists; each element is
+					of type fts_node_t */
+};
+
+/** Word text plus its array of nodes, as on disk in the FTS index */
+struct fts_word_t {
+	fts_string_t	text;		/*!< Word value in UTF-8 */
+	ib_vector_t*	nodes;		/*!< Nodes read from disk */
+
+	ib_alloc_t*	heap_alloc;	/*!< For handling all allocations */
+};
+
+/** Callback for reading and filtering nodes that are read from FTS index */
+struct fts_fetch_t {
+	void*		read_arg;	/*!< Argument for the read_record callback */
+
+	fts_sql_callback
+			read_record;	/*!< Callback for reading an index
+					record */
+	size_t		total_memory;	/*!< Total memory used */
+};
+
+/** One entry of the table used for horizontally splitting an FTS auxiliary index */
+struct fts_index_selector_t {
+	ulint		value;		/*!< Character value at which
+					to split */
+
+	const char*	suffix;		/*!< FTS aux index suffix */
+};
+
+/** This type represents a single document. */
+struct fts_doc_t {
+	fts_string_t	text;		/*!< document text */
+
+	ibool		found;		/*!< TRUE if the document was found
+					successfully in the database */
+
+	ib_rbt_t*	tokens;		/*!< This is filled when the document
+					is tokenized. Tokens; indexed by
+					fts_string_t*, cells are of type
+					fts_token_t* */
+
+	ib_alloc_t*	self_heap;	/*!< An instance of this type is
+					allocated from this heap, along
+					with any objects that have the
+					same lifespan, most notably
+					the vector of token positions */
+	CHARSET_INFO*	charset;	/*!< Document's charset info */
+
+	st_mysql_ftparser* parser;	/*!< FTS plugin parser */
+
+	ib_rbt_t*	stopwords;	/*!< Stopwords */
+};
+
+/** A token and its positions within a document. */
+struct fts_token_t {
+	fts_string_t	text;		/*!< token text */
+
+	ib_vector_t*	positions;	/*!< an array of the positions at which
+					the token is found; each item is
+					actually a ulint. */
+};
+
+/** Table of split points and suffixes; it is defined in fts/fts0fts.c */
+extern const fts_index_selector_t fts_index_selector[];
+
+/******************************************************************//**
+Compare the doc_ids of two fts_trx_row_t instances. */
+UNIV_INLINE
+int
+fts_trx_row_doc_id_cmp(
+/*===================*/
+						/*!< out:
+						< 0 if n1 < n2,
+						0 if n1 == n2,
+						> 0 if n1 > n2 */
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2);			/*!< in: id2 */
+
+/******************************************************************//**
+Compare the doc_ids of two fts_ranking_t instances. */
+UNIV_INLINE
+int
+fts_ranking_doc_id_cmp(
+/*===================*/
+						/*!< out:
+						< 0 if n1 < n2,
+						0 if n1 == n2,
+						> 0 if n1 > n2 */
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2);			/*!< in: id2 */
+
+/******************************************************************//**
+Compare two doc_ids that are passed by pointer. */
+UNIV_INLINE
+int fts_doc_id_cmp(
+/*==================*/
+						/*!< out:
+						< 0 if n1 < n2,
+						0 if n1 == n2,
+						> 0 if n1 > n2 */
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2);			/*!< in: id2 */
+
+/******************************************************************//**
+Duplicate a string. */
+UNIV_INLINE
+void
+fts_string_dup(
+/*===========*/
+						/*!< no return value:
+						the copy is made in
+						dst, with its text
+						allocated from heap */
+	fts_string_t*		dst,		/*!< out: dup to here */
+	const fts_string_t*	src,		/*!< in: src string */
+	mem_heap_t*		heap);		/*!< in: heap to use */
+
+/******************************************************************//**
+Get the suffix of the selected FTS auxiliary INDEX table. */
+UNIV_INLINE
+const char*
+fts_get_suffix(
+/*===========*/
+	ulint		selected);		/*!< in: selected index */
+
+/** Select the FTS auxiliary index for the given string.
+@param[in]	cs	charset
+@param[in]	str	string
+@param[in]	len	string length in bytes
+@return the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+	const CHARSET_INFO*	cs,
+	const byte*		str,
+	ulint			len);
+
+#include "fts0types.inl"
+
+#endif /* INNOBASE_FTS0TYPES_H */
diff --git a/storage/innobase/include/fts0types.inl b/storage/innobase/include/fts0types.inl
new file mode 100644
index 00000000..facc1e5c
--- /dev/null
+++ b/storage/innobase/include/fts0types.inl
@@ -0,0 +1,231 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.inl
+Full text search types.
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_IC
+#define INNOBASE_FTS0TYPES_IC
+
+/******************************************************************//**
+Duplicate a string: the text of src is copied into memory allocated
+from heap and NUL-terminated. The function does not return a value. */
+UNIV_INLINE
+void
+fts_string_dup(
+/*===========*/
+	fts_string_t*		dst,		/*!< out: dup to here */
+	const fts_string_t*	src,		/*!< in: src string */
+	mem_heap_t*		heap)		/*!< in: heap to use */
+{
+	dst->f_len = src->f_len;
+	dst->f_n_char = src->f_n_char;
+	dst->f_str = (byte*)mem_heap_alloc(heap, dst->f_len + 1);
+	memcpy(dst->f_str, src->f_str, dst->f_len);
+	/* Terminate the copy; one extra byte was allocated for this. */
+	dst->f_str[dst->f_len] = 0;
+}
+
+/******************************************************************//**
+Compare two fts_trx_row_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_row_doc_id_cmp(
+/*===================*/
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2)			/*!< in: id2 */
+{
+	const fts_trx_row_t*	tr1 = (const fts_trx_row_t*) p1;
+	const fts_trx_row_t*	tr2 = (const fts_trx_row_t*) p2;
+	/* Do not subtract: a difference narrowed to int can lose its sign. */
+	return(tr1->doc_id < tr2->doc_id ? -1 : tr1->doc_id > tr2->doc_id);
+}
+
+/******************************************************************//**
+Compare two fts_ranking_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_ranking_doc_id_cmp(
+/*===================*/
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2)			/*!< in: id2 */
+{
+	const fts_ranking_t*	rk1 = (const fts_ranking_t*) p1;
+	const fts_ranking_t*	rk2 = (const fts_ranking_t*) p2;
+	/* Do not subtract: a difference narrowed to int can lose its sign. */
+	return(rk1->doc_id < rk2->doc_id ? -1 : rk1->doc_id > rk2->doc_id);
+}
+
+/******************************************************************//**
+Compare two doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int fts_doc_id_cmp(
+/*==================*/
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2)			/*!< in: id2 */
+{
+	const doc_id_t*	up1 = static_cast<const doc_id_t*>(p1);
+	const doc_id_t*	up2 = static_cast<const doc_id_t*>(p2);
+	/* Do not subtract: a difference narrowed to int can lose its sign. */
+	return *up1 < *up2 ? -1 : *up1 > *up2;
+}
+
+/******************************************************************//**
+Get the first character's code position for FTS index partitioning. */
+extern
+ulint
+innobase_strnxfrm(
+/*==============*/
+        const CHARSET_INFO*	cs,	/*!< in: Character set */
+        const uchar*		p2,	/*!< in: string */
+        const ulint		len2);	/*!< in: string length */
+
+/** Check whether the charset of an fts index is a CJK charset.
+@param[in]	cs	charset
+@retval	true	if cs->number is one of the listed CJK collations
+@retval	false	otherwise */
+inline bool fts_is_charset_cjk(const CHARSET_INFO* cs)
+{
+	/* Chinese: big5 (1), gb2312 (24), gbk (28) */
+	const bool	chinese = cs->number == 1
+		|| cs->number == 24 || cs->number == 28;
+
+	/* Japanese: ujis (12), sjis (13), cp932 (95), eucjpms (97) */
+	const bool	japanese = cs->number == 12
+		|| cs->number == 13
+		|| cs->number == 95 || cs->number == 97;
+
+	/* Korean: euckr (19) */
+	const bool	korean = cs->number == 19;
+
+	return chinese || japanese || korean;
+}
+
+/** Select the FTS auxiliary index for a string by comparing the value
+returned by innobase_strnxfrm() with the split points of fts_index_selector.
+@param[in]	cs	charset
+@param[in]	str	string
+@retval	the index to use for the string (len is the string length) */
+UNIV_INLINE
+ulint
+fts_select_index_by_range(
+	const CHARSET_INFO*	cs,
+	const byte*		str,
+	ulint			len)
+{
+	const ulint	key = innobase_strnxfrm(cs, str, len);
+	ulint		pos;
+
+	/* The selector table ends with an entry whose value is 0. */
+	for (pos = 0; fts_index_selector[pos].value != 0; pos++) {
+		const ulint	bound = fts_index_selector[pos].value;
+		if (bound == key) {
+			/* Exact hit on a split point. */
+			return(pos);
+		}
+
+		if (bound > key) {
+			/* Passed the key: use the preceding entry. */
+			return(pos ? pos - 1 : 0);
+		}
+	}
+
+	ut_ad(pos > 1);
+
+	return(pos - 1);
+}
+
+/** Select the FTS auxiliary index for a string by hashing its first
+character with the collation and reducing modulo FTS_NUM_AUX_INDEX.
+@param[in]	cs	charset
+@param[in]	str	string
+@retval the index to use; 0 if str is NULL or len (string length) is 0 */
+UNIV_INLINE
+ulint
+fts_select_index_by_hash(
+	const CHARSET_INFO*	cs,
+	const byte*		str,
+	ulint			len)
+{
+	ut_ad(str != NULL || len == 0);
+
+	if (str == NULL || len == 0) {
+		/* Nothing to hash. */
+		return 0;
+	}
+
+	/* Length of the first character, as reported by the charset. */
+	/* JAN: TODO: MySQL 5.7 had
+	char_len = my_mbcharlen_ptr(cs, reinterpret_cast<const char*>(str),
+				    reinterpret_cast<const char*>(str + len));
+	*/
+	const size_t	char_bytes = size_t(cs->charlen(str, str + len));
+
+	/* NOTE(review): confirm charlen() cannot exceed len outside ut_ad. */
+	ut_ad(char_bytes <= len);
+
+	ulong	hash = 1;
+	ulong	hash_aux = 4;
+
+	my_ci_hash_sort(cs, str, char_bytes, &hash, &hash_aux);
+	return(hash % FTS_NUM_AUX_INDEX);
+}
+
+/** Select the FTS auxiliary index for the given string.
+@param[in]	cs	charset
+@param[in]	str	string
+@param[in]	len	string length in bytes
+@retval	the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+	const CHARSET_INFO*	cs,
+	const byte*		str,
+	ulint			len)
+{
+	/* CJK charsets are distributed by hash, all the
+	others by range. */
+	if (fts_is_charset_cjk(cs)) {
+
+		return(fts_select_index_by_hash(cs, str, len));
+	}
+
+	/* Not a CJK charset. */
+	return(fts_select_index_by_range(cs, str, len));
+}
+
+/******************************************************************//**
+Return the selected FTS aux index suffix.
+@return the suffix member of fts_index_selector[selected] */
+UNIV_INLINE
+const char*
+fts_get_suffix(
+	ulint		selected)	/*!< in: selected index */
+{
+	return((fts_index_selector + selected)->suffix);
+}
+
+#endif /* INNOBASE_FTS0TYPES_IC */
diff --git a/storage/innobase/include/fts0vlc.h b/storage/innobase/include/fts0vlc.h
new file mode 100644
index 00000000..d6e60377
--- /dev/null
+++ b/storage/innobase/include/fts0vlc.h
@@ -0,0 +1,124 @@
+/**
+
+Copyright (c) 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+**/
+/**
+@file include/fts0vlc.h
+Full text variable length integer encoding/decoding.
+
+Created 2021-10-19 Thirunarayanan Balathandayuthapani
+**/
+
+/** Return length of val if it were encoded using our VLC scheme.
+The length grows by one byte for every further 7 bits that val
+occupies.
+@param	val	value to encode
+@return length of value encoded, in bytes (1 to 10) */
+inline size_t fts_get_encoded_len(doc_id_t val)
+{
+  const doc_id_t one= 1;
+
+  /* The limits are tested in ascending order; the first one that
+  exceeds val determines the length. A value with bit 63 set is
+  below none of them and takes the maximum of 10 bytes. */
+  const size_t len=
+    val < one << 7 ? 1
+    : val < one << 14 ? 2
+    : val < one << 21 ? 3
+    : val < one << 28 ? 4
+    : val < one << 35 ? 5
+    : val < one << 42 ? 6
+    : val < one << 49 ? 7
+    : val < one << 56 ? 8
+    : val < one << 63 ? 9
+    : 10;
+  return len;
+}
+
+/** Encode an integer using our VLC scheme: 7 bits per byte, the most
+significant group first, the last byte marked by its high bit.
+@param	val	value to encode
+@param	buf	buffer, must have enough space
+@return pointer to the byte following the encoded value */
+inline byte *fts_encode_int(doc_id_t val, byte *buf)
+{
+  if (val < static_cast<doc_id_t>(1) << 7)
+    goto add_1;
+  if (val < static_cast<doc_id_t>(1) << 14)
+    goto add_2;
+  if (val < static_cast<doc_id_t>(1) << 21)
+    goto add_3;
+  if (val < static_cast<doc_id_t>(1) << 28)
+    goto add_4;
+  if (val < static_cast<doc_id_t>(1) << 35)
+    goto add_5;
+  if (val < static_cast<doc_id_t>(1) << 42)
+    goto add_6;
+  if (val < static_cast<doc_id_t>(1) << 49)
+    goto add_7;
+  if (val < static_cast<doc_id_t>(1) << 56)
+    goto add_8;
+  if (val < static_cast<doc_id_t>(1) << 63)
+    goto add_9;
+
+  *buf++= static_cast<byte>(val >> 63); /* 10-byte case: bit 63 only */
+add_9: /* each label falls through to all the labels below it */
+  *buf++= static_cast<byte>(val >> 56) & 0x7F;
+add_8:
+  *buf++= static_cast<byte>(val >> 49) & 0x7F;
+add_7:
+  *buf++= static_cast<byte>(val >> 42) & 0x7F;
+add_6:
+  *buf++= static_cast<byte>(val >> 35) & 0x7F;
+add_5:
+  *buf++= static_cast<byte>(val >> 28) & 0x7F;
+add_4:
+  *buf++= static_cast<byte>(val >> 21) & 0x7F;
+add_3:
+  *buf++= static_cast<byte>(val >> 14) & 0x7F;
+add_2:
+  *buf++= static_cast<byte>(val >> 7) & 0x7F;
+add_1:
+  *buf++= static_cast<byte>(val) | 0x80; /* high bit marks the last byte */
+  return buf;
+}
+
+/** Decode and return the integer that was encoded using
+our VLC scheme.
+@param	ptr 	pointer to decode from; it is advanced past
+		the bytes that were decoded
+@return value decoded */
+inline doc_id_t fts_decode_vlc(const byte **ptr)
+{
+  ut_d(const byte *const first= *ptr);
+  ut_ad(*first);
+
+  doc_id_t decoded= 0;
+  byte cur= *(*ptr)++;
+
+  /* A clear high bit means that more bytes of the integer follow. */
+  while (!(cur & 0x80))
+  {
+    decoded|= (cur & 0x7F);
+    ut_ad(decoded < static_cast<doc_id_t>(1) << (64 - 7));
+    decoded<<= 7;
+    cur= *(*ptr)++;
+  }
+
+  /* High-bit on means "last byte in the encoded integer". */
+  decoded|= (cur & 0x7F);
+  ut_ad(*ptr - first <= 10);
+  return decoded;
+}
diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h
new file mode 100644
index 00000000..746dab80
--- /dev/null
+++ b/storage/innobase/include/fut0lst.h
@@ -0,0 +1,156 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0lst.h
+File-based list utilities
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#pragma once
+
+/* The physical size of a list base node in bytes: 4 bytes and 2 addresses */
+#define	FLST_BASE_NODE_SIZE	(4 + 2 * FIL_ADDR_SIZE)
+/* The physical size of a list node in bytes: 2 addresses */
+#define	FLST_NODE_SIZE		(2 * FIL_ADDR_SIZE)
+
+#ifdef UNIV_INNOCHECKSUM
+# include "fil0fil.h"
+#else
+# include "mtr0log.h"
+
+typedef	byte	flst_base_node_t;	/* a list base node is addressed as bytes */
+typedef	byte	flst_node_t;	/* a list node is addressed as bytes */
+
+/* The field offsets of a list node */
+#define FLST_PREV	0	/* 6-byte address of the previous list element;
+				the page part of the address is FIL_NULL if
+				there is no previous element */
+#define FLST_NEXT	FIL_ADDR_SIZE	/* 6-byte address of the next
+				list element; the page part of the address
+				is FIL_NULL if there is no next element */
+
+/* The field offsets of a list base node */
+#define FLST_LEN	0	/* 32-bit list length field */
+#define	FLST_FIRST	4	/* 6-byte address of the first element
+				of the list; undefined if the list is empty */
+#define	FLST_LAST	(4 + FIL_ADDR_SIZE) /* 6-byte address of the
+				last element of the list; undefined
+				if the list is empty */
+
+/** Initialize a zero-initialized list base node.
+@param[in,out]	block	file page
+@param[in]	ofs	byte offset of the list base node
+@param[in,out]	mtr	mini-transaction */
+inline void flst_init(const buf_block_t* block, uint16_t ofs, mtr_t* mtr)
+{
+  ut_d(const page_t *page= block->page.frame);
+  ut_ad(!mach_read_from_4(FLST_LEN + ofs + page)); /* all 32 bits; not set here */
+  ut_ad(!mach_read_from_2(FLST_FIRST + FIL_ADDR_BYTE + ofs + page));
+  ut_ad(!mach_read_from_2(FLST_LAST + FIL_ADDR_BYTE + ofs + page));
+  compile_time_assert(FIL_NULL == 0xffU * 0x1010101U); /* 4 bytes of 0xff */
+  mtr->memset(block, FLST_FIRST + FIL_ADDR_PAGE + ofs, 4, 0xff);
+  mtr->memset(block, FLST_LAST + FIL_ADDR_PAGE + ofs, 4, 0xff);
+}
+
+/** Initialize a list base node that is given by pointer.
+@param[in]      block   file page
+@param[in,out]  base    base node
+@param[in,out]  mtr     mini-transaction */
+void flst_init(const buf_block_t &block, byte *base, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
+
+/** Append a file list node to a list.
+@param[in,out]  base    base node block
+@param[in]      boffset byte offset of the base node
+@param[in,out]  add     block to be added
+@param[in]      aoffset byte offset of the node to be added
+@param[in,out]  mtr     mini-transaction
+@return error code */
+dberr_t flst_add_last(buf_block_t *base, uint16_t boffset,
+                      buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Prepend a file list node to a list.
+@param[in,out]  base    base node block
+@param[in]      boffset byte offset of the base node
+@param[in,out]  add     block to be added
+@param[in]      aoffset byte offset of the node to be added
+@param[in,out]  mtr     mini-transaction
+@return error code */
+dberr_t flst_add_first(buf_block_t *base, uint16_t boffset,
+                    buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Remove a file list node from a list.
+@param[in,out]  base    base node block
+@param[in]      boffset byte offset of the base node
+@param[in,out]  cur     block to be removed
+@param[in]      coffset byte offset of the node to be removed
+@param[in,out]  mtr     mini-transaction
+@return error code */
+dberr_t flst_remove(buf_block_t *base, uint16_t boffset,
+                    buf_block_t *cur, uint16_t coffset, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** @return the 32-bit length field of a list base node */
+inline uint32_t flst_get_len(const flst_base_node_t *base)
+{
+  return mach_read_from_4(&base[FLST_LEN]);
+}
+
+/** @return the file address (4-byte page, 2-byte offset) stored at faddr */
+inline fil_addr_t flst_read_addr(const byte *faddr)
+{
+  fil_addr_t result= { mach_read_from_4(&faddr[FIL_ADDR_PAGE]),
+                       mach_read_from_2(&faddr[FIL_ADDR_BYTE]) };
+  ut_a(result.page == FIL_NULL || result.boffset >= FIL_PAGE_DATA);
+  ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA);
+  return result;
+}
+
+/** @return the address stored in the FLST_FIRST field of a base node */
+inline fil_addr_t flst_get_first(const flst_base_node_t *base)
+{
+  return flst_read_addr(&base[FLST_FIRST]);
+}
+
+/** @return the address stored in the FLST_LAST field of a base node */
+inline fil_addr_t flst_get_last(const flst_base_node_t *base)
+{
+  return flst_read_addr(&base[FLST_LAST]);
+}
+
+/** @return the address stored in the FLST_NEXT field of a list node */
+inline fil_addr_t flst_get_next_addr(const flst_node_t* node)
+{
+  return flst_read_addr(&node[FLST_NEXT]);
+}
+
+/** @return the address stored in the FLST_PREV field of a list node */
+inline fil_addr_t flst_get_prev_addr(const flst_node_t *node)
+{
+  return flst_read_addr(&node[FLST_PREV]);
+}
+
+# ifdef UNIV_DEBUG
+/** Validate a file-based list; declared only when UNIV_DEBUG is defined. */
+void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr);
+# endif
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/gis0geo.h b/storage/innobase/include/gis0geo.h
new file mode 100644
index 00000000..3fd01a3a
--- /dev/null
+++ b/storage/innobase/include/gis0geo.h
@@ -0,0 +1,122 @@
+/*****************************************************************************
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software Foundation,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+*****************************************************************************/
+
+/**************************************************//**
+@file include/gis0geo.h
+The r-tree define from MyISAM
+*******************************************************/
+
+#ifndef _gis0geo_h
+#define _gis0geo_h
+
+#include "my_global.h"
+#include "string.h"
+
+#define SPTYPE HA_KEYTYPE_DOUBLE
+#define SPLEN  8 /* presumably sizeof(double), matching SPTYPE; confirm */
+
+/* The mbr of a point or of a linestring can have an area of 0. For that
+case, this weight is defined for calculating the area increase when
+the mbr needs to be enlarged. */
+#define LINE_MBR_WEIGHTS	0.001
+
+/* Geometry types of the "well-known binary representation" (wkb) format. */
+enum wkbType
+{
+  wkbPoint = 1,
+  wkbLineString = 2,
+  wkbPolygon = 3,
+  wkbMultiPoint = 4,
+  wkbMultiLineString = 5,
+  wkbMultiPolygon = 6,
+  wkbGeometryCollection = 7
+};
+
+/* Byte order of the "well-known binary representation" (wkb) format. */
+enum wkbByteOrder
+{
+  wkbXDR = 0,    /* Big Endian    */
+  wkbNDR = 1     /* Little Endian */
+};
+
+/*************************************************************//**
+Calculate the minimal bounding rectangle (mbr) of the spatial object
+stored in "well-known binary representation" (wkb) format.
+@return 0 if ok */
+int
+rtree_mbr_from_wkb(
+/*===============*/
+	const uchar*	wkb,		/*!< in: pointer to wkb. */
+	uint	size,		/*!< in: size of wkb. */
+	uint	n_dims,		/*!< in: number of dimensions. */
+	double*	mbr);		/*!< in/out: mbr. */
+
+/* R-tree split node structure. */
+struct rtr_split_node_t
+{
+	double	square;		/* square of the mbr. */
+	int	n_node;		/* which group the node is in. */
+	uchar*	key;		/* key. */
+	double* coords;		/* mbr. */
+};
+
+/*************************************************************//**
+Reserve n_dim * 2 doubles at the start of *d_buffer and advance it.
+@return the start of the reserved coordinates */
+inline
+static
+double*
+reserve_coords(double	**d_buffer,	/*!< in/out: buffer. */
+	       int	n_dim)		/*!< in: dimensions. */
+{
+  double *const reserved = *d_buffer;
+  *d_buffer = reserved + n_dim * 2;
+  return reserved;
+}
+
+/*************************************************************//**
+Split rtree nodes.
+@return which group the first rec is in. */
+int
+split_rtree_node(
+/*=============*/
+	rtr_split_node_t*	node,		/*!< in: split nodes.*/
+	int			n_entries,	/*!< in: number of entries.*/
+	int			all_size,	/*!< in: total key's size.*/
+	int			key_size,	/*!< in: key's size.*/
+	int			min_size,	/*!< in: minimal group size.*/
+	int			size1,		/*!< in: size of group.*/
+	int			size2,		/*!< in: initial group sizes */
+	double**		d_buffer,	/*!< in/out: buffer.*/
+	int			n_dim,		/*!< in: dimensions. */
+	uchar*			first_rec);	/*!< in: the first rec. */
+
+/** Compare two minimum bounding rectangles.
+@param mode   comparison operator
+   MBR_INTERSECT(a,b)  a overlaps b
+   MBR_CONTAIN(a,b)    a contains b
+   MBR_DISJOINT(a,b)   a disjoint b
+   MBR_WITHIN(a,b)     a within   b
+   MBR_EQUAL(a,b)      All coordinates of MBRs are equal
+   MBR_DATA(a,b)       Data reference is the same
+@param b first MBR
+@param a second MBR
+@retval 0 if the predicate holds
+@retval 1 if the predicate does not hold */
+int rtree_key_cmp(page_cur_mode_t mode, const void *b, const void *a);
+#endif
diff --git a/storage/innobase/include/gis0rtree.h b/storage/innobase/include/gis0rtree.h
new file mode 100644
index 00000000..b07261ce
--- /dev/null
+++ b/storage/innobase/include/gis0rtree.h
@@ -0,0 +1,513 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/gis0rtree.h
+R-tree header file
+
+Created 2013/03/27 Jimmy Yang and Allen Lai
+***********************************************************************/
+
+#ifndef gis0rtree_h
+#define gis0rtree_h
+
+#include "btr0cur.h"
+#include "rem0types.h"
+
+/* Whether MBR 'a' contains 'b' */
+#define	MBR_CONTAIN_CMP(a, b)					\
+	((((b)->xmin >= (a)->xmin) && ((b)->xmax <= (a)->xmax)	\
+	 && ((b)->ymin >= (a)->ymin) && ((b)->ymax <= (a)->ymax)))
+
+/* Whether MBR 'a' is equal to 'b' */
+#define	MBR_EQUAL_CMP(a, b)					\
+	((((b)->xmin == (a)->xmin) && ((b)->xmax == (a)->xmax))	\
+	 && (((b)->ymin == (a)->ymin) && ((b)->ymax == (a)->ymax)))
+
+/* Whether MBR 'a' intersects 'b': the ranges must overlap on both axes */
+#define	MBR_INTERSECT_CMP(a, b)					\
+	((((b)->xmin <= (a)->xmax) && ((b)->xmax >= (a)->xmin))	\
+	 && (((b)->ymin <= (a)->ymax) && ((b)->ymax >= (a)->ymin)))
+
+/* Whether MBR 'a' and 'b' are disjoint (they do not intersect) */
+#define	MBR_DISJOINT_CMP(a, b)	(!MBR_INTERSECT_CMP(a, b))
+
+/* Whether MBR 'a' is within 'b' */
+#define	MBR_WITHIN_CMP(a, b)					\
+	((((b)->xmin <= (a)->xmin) && ((b)->xmax >= (a)->xmax))	\
+	 && (((b)->ymin <= (a)->ymin) && ((b)->ymax >= (a)->ymax)))
+
+/* Whether 'mode' is one of the R-tree search modes. */
+#define RTREE_SEARCH_MODE(mode)					\
+	(((mode) >= PAGE_CUR_CONTAIN) && ((mode) <= PAGE_CUR_RTREE_GET_FATHER))
+
+/* Geometry data header */
+#define	GEO_DATA_HEADER_SIZE	4
+
+/** Search for a spatial index leaf page record.
+@param cur         cursor
+@param tuple       search tuple
+@param latch_mode  latching mode
+@param mtr         mini-transaction
+@param mode        search mode */
+dberr_t rtr_search_leaf(btr_cur_t *cur, const dtuple_t *tuple,
+                        btr_latch_mode latch_mode, mtr_t *mtr,
+                        page_cur_mode_t mode= PAGE_CUR_RTREE_LOCATE)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Search for inserting a spatial index leaf record (PAGE_CUR_RTREE_INSERT).
+@param cur         cursor
+@param tuple       search tuple
+@param latch_mode  latching mode
+@param mtr         mini-transaction */
+inline dberr_t rtr_insert_leaf(btr_cur_t *cur, const dtuple_t *tuple,
+                               btr_latch_mode latch_mode, mtr_t *mtr)
+{
+  return rtr_search_leaf(cur, tuple, latch_mode, mtr, PAGE_CUR_RTREE_INSERT);
+}
+
+/** Search for a spatial index leaf page record.
+@param pcur         cursor
+@param tuple       search tuple
+@param mode        search mode
+@param mtr         mini-transaction */
+dberr_t rtr_search_leaf(btr_pcur_t *pcur, const dtuple_t *tuple,
+                        page_cur_mode_t mode, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple,
+                                page_cur_mode_t mode,
+                                btr_latch_mode latch_mode,
+                                btr_cur_t *cur, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**********************************************************************//**
+Builds a Rtree node pointer out of a physical record and a page number.
+@return own: node pointer */
+dtuple_t*
+rtr_index_build_node_ptr(
+/*=====================*/
+	const dict_index_t*	index,	/*!< in: index */
+	const rtr_mbr_t*	mbr,	/*!< in: mbr of lower page */
+	const rec_t*		rec,	/*!< in: record for which to build node
+					pointer */
+	ulint			page_no,/*!< in: page number to put in node
+					pointer */
+	mem_heap_t*		heap);	/*!< in: memory heap where pointer
+					created */
+
+/*************************************************************//**
+Splits an R-tree index page to halves and inserts the tuple. It is assumed
+that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
+released within this function! NOTE that the operation of this
+function must always succeed, we cannot reverse it: therefore enough
+free disk space (2 pages) must be guaranteed to be available before
+this function is called.
+@return inserted record */
+rec_t*
+rtr_page_split_and_insert(
+/*======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	btr_cur_t*	cursor,	/*!< in/out: cursor at which to insert; when the
+				function returns, the cursor is positioned
+				on the predecessor of the inserted record */
+	rec_offs**	offsets,/*!< out: offsets on inserted record */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
+	const dtuple_t*	tuple,	/*!< in: tuple to insert */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	mtr_t*		mtr,	/*!< in: mtr */
+	dberr_t*	err);	/*!< out: error code */
+
+/**************************************************************//**
+Computes the minimum bounding rectangle covering the records of a page. */
+UNIV_INLINE
+void
+rtr_page_cal_mbr(
+/*=============*/
+	const dict_index_t*	index,	/*!< in: index */
+	const buf_block_t*	block,	/*!< in: buffer block */
+	rtr_mbr_t*		mbr,	/*!< out: MBR encapsulates the page */
+	mem_heap_t*		heap);	/*!< in: heap for the memory
+					allocation */
+/*************************************************************//**
+Find the next matching record. This function will first exhaust
+the copied record listed in the rtr_info->matches vector before
+moving to next page
+@return true if there is next qualified record found, otherwise(if
+exhausted) false */
+bool
+rtr_pcur_move_to_next(
+/*==================*/
+	const dtuple_t*	tuple,	/*!< in: data tuple; NOTE: n_fields_cmp in
+				tuple must be set so that it cannot get
+				compared to the node ptr page number field! */
+	page_cur_mode_t	mode,	/*!< in: cursor search mode */
+	btr_pcur_t*	cursor, /*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	ulint		cur_level,
+				/*!< in: current level */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/****************************************************************//**
+Searches the right position in rtree for a page cursor. */
+bool
+rtr_cur_search_with_match(
+/*======================*/
+	const buf_block_t*	block,	/*!< in: buffer block */
+	dict_index_t*		index,	/*!< in: index descriptor */
+	const dtuple_t*		tuple,	/*!< in: data tuple */
+	page_cur_mode_t		mode,	/*!< in: PAGE_CUR_L,
+					PAGE_CUR_LE, PAGE_CUR_G, or
+					PAGE_CUR_GE */
+	page_cur_t*		cursor,	/*!< in/out: page cursor */
+	rtr_info_t*		rtr_info);/*!< in/out: search stack */
+
+/****************************************************************//**
+Calculate the area increased for a new record
+@return area increased */
+double
+rtr_rec_cal_increase(
+/*=================*/
+	const dtuple_t*	dtuple,	/*!< in: data tuple to insert, which
+				cause area increase */
+	const rec_t*	rec,	/*!< in: physical record which differs from
+				dtuple in some of the common fields, or which
+				has an equal number or more fields than
+				dtuple */
+	double*		area);	/*!< out: increased area */
+
+/****************************************************************//**
+Enlarges MBRs for an insert (NOTE(review): inferred from the name; confirm).
+@return error code */
+dberr_t
+rtr_ins_enlarge_mbr(
+/*=================*/
+	btr_cur_t*		cursor,	/*!< in: btr cursor */
+	mtr_t*			mtr);	/*!< in: mtr */
+
+/**************************************************************//**
+push a nonleaf index node to the search path */
+UNIV_INLINE
+void
+rtr_non_leaf_stack_push(
+/*====================*/
+	rtr_node_path_t*	path,		/*!< in/out: search path */
+	uint32_t		pageno,		/*!< in: pageno to insert */
+	node_seq_t		seq_no,		/*!< in: Node sequence num */
+	ulint			level,		/*!< in: index level */
+	uint32_t		child_no,	/*!< in: child page no */
+	btr_pcur_t*		cursor,		/*!< in: position cursor */
+	double			mbr_inc);	/*!< in: MBR needs to be
+						enlarged */
+
+/**************************************************************//**
+push a nonleaf index node to the search path for insertion */
+void
+rtr_non_leaf_insert_stack_push(
+/*===========================*/
+	dict_index_t*		index,		/*!< in: index descriptor */
+	rtr_node_path_t*	path,		/*!< in/out: search path */
+	ulint			level,		/*!< in: index level */
+	const buf_block_t*	block,		/*!< in: block of the page */
+	const rec_t*		rec,		/*!< in: positioned record */
+	double			mbr_inc);	/*!< in: MBR needs to be
+						enlarged */
+
+#define rtr_get_new_ssn_id(index) (index)->assign_ssn()
+#define rtr_get_current_ssn_id(index) (index)->ssn()
+
+/********************************************************************//**
+Create a RTree search info structure */
+rtr_info_t*
+rtr_create_rtr_info(
+/******************/
+	bool		need_prdt,	/*!< in: Whether predicate lock is
+					needed */
+	bool		init_matches,	/*!< in: Whether to initiate the
+					"matches" structure for collecting
+					matched leaf records */
+	btr_cur_t*	cursor,		/*!< in: tree search cursor */
+	dict_index_t*	index);		/*!< in: index struct */
+
+/********************************************************************//**
+Update a btr_cur_t with rtr_info */
+void
+rtr_info_update_btr(
+/******************/
+	btr_cur_t*	cursor,		/*!< in/out: tree cursor */
+	rtr_info_t*	rtr_info);	/*!< in: rtr_info to set to the
+					cursor */
+
+/********************************************************************//**
+Initialize (or, with reinit set, reinitialize) a rtr_info_t structure */
+void
+rtr_init_rtr_info(
+/****************/
+	rtr_info_t*	rtr_info,	/*!< in/out: rtr_info to
+					initialize */
+	bool		need_prdt,	/*!< in: Whether predicate lock is
+					needed */
+	btr_cur_t*	cursor,		/*!< in: tree search cursor */
+	dict_index_t*	index,		/*!< in: index structure */
+	bool		reinit);	/*!< in: Whether this is a reinit */
+
+/**************************************************************//**
+Clean up Rtree cursor */
+void
+rtr_clean_rtr_info(
+/*===============*/
+	rtr_info_t*	rtr_info,	/*!< in: RTree search info */
+	bool		free_all);	/*!< in: need to free rtr_info itself */
+
+/****************************************************************//**
+Get the bounding box content from an index record */
+void
+rtr_get_mbr_from_rec(
+/*=================*/
+	const rec_t*	rec,	/*!< in: index record */
+	const rec_offs*	offsets,/*!< in: offsets array */
+	rtr_mbr_t*	mbr);	/*!< out: MBR */
+
+/****************************************************************//**
+Get the bounding box content from a MBR data record */
+void
+rtr_get_mbr_from_tuple(
+/*===================*/
+	const dtuple_t*	dtuple,	/*!< in: data tuple */
+	rtr_mbr*	mbr);	/*!< out: mbr to fill */
+
+/** Get the rtree page father.
+@param[in,out]	mtr		mtr
+@param[in]	sea_cur		search cursor, contains information
+				about parent nodes in search
+@param[in,out]	cursor		cursor on node pointer record,
+				its page x-latched
+@return whether the cursor was successfully positioned */
+bool rtr_page_get_father(mtr_t *mtr, btr_cur_t *sea_cur, btr_cur_t *cursor)
+  MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
+
+/************************************************************//**
+Returns the father block to a page. It is assumed that mtr holds
+an X or SX latch on the tree.
+@return rec_get_offsets() of the node pointer record */
+rec_offs*
+rtr_page_get_father_block(
+/*======================*/
+	rec_offs*	offsets,/*!< in: work area for the return value */
+	mem_heap_t*	heap,	/*!< in: memory heap to use */
+	mtr_t*		mtr,	/*!< in: mtr */
+	btr_cur_t*	sea_cur,/*!< in: search cursor, contains information
+				about parent nodes in search */
+	btr_cur_t*	cursor);/*!< out: cursor on node pointer record,
+				its page x-latched */
+/**************************************************************//**
+Store the parent path cursor
+@return number of cursor stored */
+ulint
+rtr_store_parent_path(
+/*==================*/
+	const buf_block_t*	block,	/*!< in: block of the page */
+	btr_cur_t*		btr_cur,/*!< in/out: persistent cursor */
+	btr_latch_mode		latch_mode,
+					/*!< in: latch_mode */
+	ulint			level,	/*!< in: index level */
+	mtr_t*			mtr);	/*!< in: mtr */
+
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. It should be
+closed with btr_pcur_close. */
+bool rtr_search(
+	const dtuple_t*	tuple,	/*!< in: tuple on which search done */
+	btr_latch_mode	latch_mode,/*!< in: BTR_MODIFY_LEAF, ... */
+	btr_pcur_t*	cursor,	/*!< in: memory buffer for persistent cursor */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************//**
+Returns the R-Tree node stored in the parent search path
+@return pointer to R-Tree cursor component */
+UNIV_INLINE
+node_visit_t*
+rtr_get_parent_node(
+/*================*/
+	btr_cur_t*	btr_cur,	/*!< in: persistent cursor */
+	ulint		level,		/*!< in: index level of buffer page */
+	ulint		is_insert);	/*!< in: whether it is insert */
+
+/*********************************************************//**
+Returns the R-Tree cursor stored in the parent search path
+@return pointer to R-Tree cursor component */
+UNIV_INLINE
+btr_pcur_t*
+rtr_get_parent_cursor(
+/*==================*/
+	btr_cur_t*	btr_cur,	/*!< in: persistent cursor */
+	ulint		level,		/*!< in: index level of buffer page */
+	ulint		is_insert);	/*!< in: whether insert operation */
+
+MY_ATTRIBUTE((warn_unused_result))
+/*************************************************************//**
+Copy recs from a page to new_block of rtree.
+
+@return error code */
+dberr_t
+rtr_page_copy_rec_list_end_no_locks(
+/*================================*/
+	buf_block_t*	new_block,	/*!< in: index page to copy to */
+	buf_block_t*	block,		/*!< in: index page of rec */
+	rec_t*		rec,		/*!< in: record on page */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	rtr_rec_move_t*	rec_move,	/*!< in: recording records moved */
+	ulint		max_move,	/*!< in: num of rec to move */
+	ulint*		num_moved,	/*!< out: num of rec moved */
+	mtr_t*		mtr);		/*!< in: mtr */
+
+MY_ATTRIBUTE((warn_unused_result))
+/*************************************************************//**
+Copy recs till a specified rec from a page to new_block of rtree.
+
+@return error code */
+dberr_t
+rtr_page_copy_rec_list_start_no_locks(
+/*==================================*/
+	buf_block_t*	new_block,	/*!< in: index page to copy to */
+	buf_block_t*	block,		/*!< in: index page of rec */
+	rec_t*		rec,		/*!< in: record on page */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	rtr_rec_move_t*	rec_move,	/*!< in: recording records moved */
+	ulint		max_move,	/*!< in: num of rec to move */
+	ulint*		num_moved,	/*!< out: num of rec moved */
+	mtr_t*		mtr);		/*!< in: mtr */
+
+/****************************************************************//**
+Merge 2 mbrs and update the mbr that cursor is on. */
+void
+rtr_merge_and_update_mbr(
+/*=====================*/
+	btr_cur_t*		cursor,		/*!< in/out: cursor */
+	btr_cur_t*		cursor2,	/*!< in: the other cursor */
+	rec_offs*		offsets,	/*!< in: rec offsets */
+	rec_offs*		offsets2,	/*!< in: rec offsets */
+	page_t*			child_page,	/*!< in: the child page. */
+	mtr_t*			mtr);		/*!< in: mtr */
+
+/*************************************************************//**
+Deletes on the upper level the node pointer to a page. */
+void
+rtr_node_ptr_delete(
+/*================*/
+	btr_cur_t*	cursor,	/*!< in: search cursor, contains information
+				about parent nodes in search */
+	mtr_t*		mtr);	/*!< in: mtr */
+
+/****************************************************************//**
+Check two MBRs are identical or need to be merged */
+bool
+rtr_merge_mbr_changed(
+/*==================*/
+	btr_cur_t*	cursor,		/*!< in: cursor */
+	btr_cur_t*	cursor2,	/*!< in: the other cursor */
+	rec_offs*	offsets,	/*!< in: rec offsets */
+	rec_offs*	offsets2,	/*!< in: rec offsets */
+	rtr_mbr_t*	new_mbr);	/*!< out: MBR to update */
+
+
+/**************************************************************//**
+Update the mbr field of a spatial index row. */
+void
+rtr_update_mbr_field(
+/*=================*/
+	btr_cur_t*	cursor,		/*!< in: cursor pointed to rec.*/
+	rec_offs*	offsets,	/*!< in: offsets on rec. */
+	btr_cur_t*	cursor2,	/*!< in/out: cursor pointed to rec
+					that should be deleted.
+					this cursor is for btr_compress to
+					delete the merged page's father rec.*/
+	page_t*		child_page,	/*!< in: child page. */
+	rtr_mbr_t*	new_mbr,	/*!< in: the new mbr. */
+	rec_t*		new_rec,	/*!< in: rec to use */
+	mtr_t*		mtr);		/*!< in: mtr */
+
+/**************************************************************//**
+Check whether a Rtree page is child of a parent page
+@return true if there is child/parent relationship */
+bool
+rtr_check_same_block(
+/*=================*/
+	dict_index_t*	index,	/*!< in: index tree */
+	btr_cur_t*	cur,	/*!< in/out: position at the parent entry
+				pointing to the child if successful */
+	buf_block_t*	parentb,/*!< in: parent page to check */
+	mem_heap_t*	heap);	/*!< in: memory heap */
+
+/*********************************************************************//**
+Writes the SPDIMS * 2 coordinates of an MBR to a data buffer. */
+UNIV_INLINE
+void
+rtr_write_mbr(
+/*==========*/
+	byte*			data,	/*!< out: buffer to write to */
+	const rtr_mbr_t*	mbr);	/*!< in: MBR to write */
+
+/*********************************************************************//**
+Reads the SPDIMS * 2 coordinates of an MBR from a data buffer. */
+UNIV_INLINE
+void
+rtr_read_mbr(
+/*==========*/
+	const byte*		data,	/*!< in: buffer to read from */
+	rtr_mbr_t*		mbr);	/*!< out: MBR read */
+
+/**************************************************************//**
+Check whether a discarding page is in anyone's search path */
+void
+rtr_check_discard_page(
+/*===================*/
+	dict_index_t*	index,	/*!< in: index */
+	btr_cur_t*	cursor,	/*!< in: cursor on the page to discard: not on
+				the root page */
+	buf_block_t*	block);	/*!< in: block of page to be discarded */
+
+/********************************************************************//**
+Reinitialize a RTree search info */
+UNIV_INLINE
+void
+rtr_info_reinit_in_cursor(
+/************************/
+	btr_cur_t*	cursor,		/*!< in/out: tree cursor */
+	dict_index_t*	index,		/*!< in: index struct */
+	bool		need_prdt);	/*!< in: Whether predicate lock is
+					needed */
+
+/** Estimates the number of rows in a given area.
+@param[in]	index	index
+@param[in]	tuple	range tuple containing mbr, may also be empty tuple
+@param[in]	mode	search mode
+@return estimated number of rows */
+ha_rows
+rtr_estimate_n_rows_in_range(
+	dict_index_t*	index,
+	const dtuple_t*	tuple,
+	page_cur_mode_t	mode);
+
+#include "gis0rtree.inl"
+#endif /*!< gis0rtree.h */
diff --git a/storage/innobase/include/gis0rtree.inl b/storage/innobase/include/gis0rtree.inl
new file mode 100644
index 00000000..5101eeb6
--- /dev/null
+++ b/storage/innobase/include/gis0rtree.inl
@@ -0,0 +1,245 @@
+/*****************************************************************************
+
+Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/gis0rtree.inl
+R-tree Inline code
+
+Created 2013/03/27 Jimmy Yang and Allen Lai
+***********************************************************************/
+
+/**************************************************************//**
+Computes the minimum bounding rectangle covering the records of a page. */
+UNIV_INLINE
+void
+rtr_page_cal_mbr(
+/*=============*/
+	const dict_index_t*	index,	/*!< in: index */
+	const buf_block_t*	block,	/*!< in: buffer block */
+	rtr_mbr_t*		rtr_mbr,/*!< out: MBR encapsulates the page */
+	mem_heap_t*		heap)	/*!< in: heap for the memory
+					allocation */
+{
+	/* Start from an inverted rectangle (min > max); each record read
+	below then extends it. */
+	rtr_mbr->xmin = DBL_MAX;
+	rtr_mbr->ymin = DBL_MAX;
+	rtr_mbr->xmax = -DBL_MAX;
+	rtr_mbr->ymax = -DBL_MAX;
+
+	/* The MBR is updated through a plain array of doubles. */
+	double* const	bounds = reinterpret_cast<double*>(rtr_mbr);
+	page_t* const	page = buf_block_get_frame(block);
+	rec_t*		rec = page_rec_get_next(page_get_infimum_rec(page));
+
+	if (UNIV_UNLIKELY(!rec)) {
+		return;
+	}
+
+	/* The offsets are computed once, for the first record only. */
+	const ulint	n_core = page_is_leaf(page) ? index->n_fields : 0;
+	rec_offs*	offsets = rec_get_offsets(rec, index, NULL, n_core,
+						  ULINT_UNDEFINED, &heap);
+
+	/* Extend the rectangle with the MBR of each record, until the
+	supremum record or the end of the record list is reached. */
+	for (;;) {
+		ulint		len;
+		/* The mbr address is in the first field. */
+		const byte*	field = rec_get_nth_field(rec, offsets, 0, &len);
+
+		ut_ad(len == DATA_MBR_LEN);
+
+		for (unsigned i = 0; i < SPDIMS; i++) {
+			/* Current bounds of dimension i, then the record's. */
+			double*		lo = bounds + i * SPDIMS;
+			double*		hi = lo + 1;
+			const double	rec_lo = mach_double_read(field);
+			const double	rec_hi = mach_double_read(
+				field + sizeof(double));
+
+			if (rec_lo < *lo) {
+				*lo = rec_lo;
+			}
+			if (rec_hi > *hi) {
+				*hi = rec_hi;
+			}
+
+			/* Step to the next (min, max) pair in the record. */
+			field += 2 * sizeof(double);
+		}
+
+		rec = page_rec_get_next(rec);
+
+		if (rec == NULL || page_rec_is_supremum(rec)) {
+			break;
+		}
+	}
+}
+
+/**************************************************************//**
+Append one visited non-leaf node to the end of a search path */
+UNIV_INLINE
+void
+rtr_non_leaf_stack_push(
+/*====================*/
+	rtr_node_path_t*	path,		/*!< in/out: search path */
+	uint32_t		pageno,		/*!< in: pageno to insert */
+	node_seq_t		seq_no,		/*!< in: Node sequence num */
+	ulint			level,		/*!< in: index page level */
+	uint32_t		child_no,	/*!< in: child page no */
+	btr_pcur_t*		cursor,		/*!< in: position cursor */
+	double			mbr_inc)	/*!< in: MBR needs to be
+						enlarged */
+{
+	node_visit_t	visit;
+
+	visit.level = level;
+	visit.page_no = pageno;
+	visit.child_no = child_no;
+	visit.seq_no = seq_no;
+	visit.mbr_inc = mbr_inc;
+	visit.cursor = cursor;
+
+	path->push_back(visit);
+
+#ifdef RTR_SEARCH_DIAGNOSTIC
+	fprintf(stderr, "INNODB_RTR: Push page %d, level %d, seq %d"
+			" to search stack \n",
+		static_cast<int>(pageno), static_cast<int>(level),
+		static_cast<int>(seq_no));
+#endif /* RTR_SEARCH_DIAGNOSTIC */
+}
+
+/*********************************************************************//**
+Writes the SPDIMS * 2 coordinates of an MBR to a data buffer. */
+UNIV_INLINE
+void
+rtr_write_mbr(
+/*==========*/
+	byte*			data,	/*!< out: buffer to write to */
+	const rtr_mbr_t*	mbr)	/*!< in: MBR to write */
+{
+	const double*	coord = reinterpret_cast<const double*>(mbr);
+
+	for (unsigned i = 0; i < SPDIMS * 2; i++, data += sizeof(double)) {
+		mach_double_write(data, coord[i]);
+	}
+}
+
+/*********************************************************************//**
+Reads the SPDIMS * 2 coordinates of an MBR from a data buffer. */
+UNIV_INLINE
+void
+rtr_read_mbr(
+/*==========*/
+	const byte*	data,	/*!< in: buffer to read from */
+	rtr_mbr_t*	mbr)	/*!< out: MBR */
+{
+	double* const	coord = reinterpret_cast<double*>(mbr);
+
+	for (unsigned i = 0; i < SPDIMS * 2; i++) {
+		coord[i] = mach_double_read(data + i * sizeof(double));
+	}
+}
+
+/*********************************************************//**
+Returns the R-Tree node stored in the parent search path
+@return pointer into the parent path vector (the mutex is released before
+returning), NULL if the path is empty or has no entry for the level */
+UNIV_INLINE
+node_visit_t*
+rtr_get_parent_node(
+/*================*/
+	btr_cur_t*	btr_cur,	/*!< in: persistent cursor */
+	ulint		level,		/*!< in: index level of buffer page */
+	ulint		is_insert)	/*!< in: whether it is insert */
+{
+	ulint			num;
+	ulint			tree_height = btr_cur->tree_height;
+	node_visit_t*		found_node = NULL;
+
+	if (level >= tree_height) {
+		return(NULL);
+	}
+
+	mysql_mutex_lock(&btr_cur->rtr_info->rtr_path_mutex);
+	num = btr_cur->rtr_info->parent_path->size();
+
+	if (!num) {
+		mysql_mutex_unlock(&btr_cur->rtr_info->rtr_path_mutex);
+		return(NULL);
+	}
+
+	if (is_insert) {
+		ulint	idx = tree_height - level - 1;
+		ut_ad(idx < num);
+		/* Release builds: return NULL rather than index out of bounds. */
+		if (idx < num) {
+			found_node = &(*btr_cur->rtr_info->parent_path)[idx];
+		}
+	} else {
+		node_visit_t*	node;
+		while (num > 0) {
+			node = &(*btr_cur->rtr_info->parent_path)[num - 1];
+
+			if (node->level == level) {
+				found_node = node;
+				break;
+			}
+			num--;
+		}
+	}
+
+	mysql_mutex_unlock(&btr_cur->rtr_info->rtr_path_mutex);
+
+	return(found_node);
+}
+
+/*********************************************************//**
+Returns the R-Tree cursor stored in the parent search path
+@return cursor of the node found by rtr_get_parent_node(), or NULL if none */
+UNIV_INLINE
+btr_pcur_t*
+rtr_get_parent_cursor(
+/*==================*/
+	btr_cur_t*	btr_cur,	/*!< in: persistent cursor */
+	ulint		level,		/*!< in: index level of buffer page */
+	ulint		is_insert)	/*!< in: whether insert operation */
+{
+	node_visit_t*	node = rtr_get_parent_node(btr_cur, level, is_insert);
+	if (node == NULL)
+		return(NULL);
+	return(node->cursor);
+}
+
+/********************************************************************//**
+Reinitialize the R-Tree search info of a btr_cur_t: clean it, then init it */
+UNIV_INLINE
+void
+rtr_info_reinit_in_cursor(
+/************************/
+	btr_cur_t*	cursor,		/*!< in/out: tree cursor */
+	dict_index_t*	index,		/*!< in: index struct */
+	bool		need_prdt)	/*!< in: Whether predicate lock is
+					needed */
+{
+	rtr_clean_rtr_info(cursor->rtr_info, false);
+	rtr_init_rtr_info(cursor->rtr_info, need_prdt, cursor, index, true);
+}
diff --git a/storage/innobase/include/gis0type.h b/storage/innobase/include/gis0type.h
new file mode 100644
index 00000000..d6a4ef67
--- /dev/null
+++ b/storage/innobase/include/gis0type.h
@@ -0,0 +1,146 @@
+/*****************************************************************************
+
+Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/gis0type.h
+R-tree header file
+
+Created 2013/03/27 Jimmy Yang
+***********************************************************************/
+
+#ifndef gis0type_h
+#define gis0type_h
+
+#include "buf0buf.h"
+#include "data0type.h"
+#include "data0types.h"
+#include "dict0types.h"
+#include "ut0vec.h"
+#include "gis0geo.h"
+
+#include <vector>
+#include <forward_list>
+
+/** Node Sequence Number. Only updated when page splits */
+typedef uint32_t     node_seq_t;
+
+/* RTree internal non-leaf Nodes to be searched, from root to leaf */
+struct node_visit_t {
+	uint32_t	page_no;	/*!< the page number */
+	node_seq_t	seq_no;		/*!< the SSN (split sequence number) */
+	ulint		level;		/*!< the page's index level */
+	uint32_t	child_no;	/*!< child page num if for parent
+					recording */
+	btr_pcur_t*	cursor;		/*!< cursor structure if we positioned
+					FIXME: there is no need to use whole
+					btr_pcur_t, just the position related
+					members */
+	double		mbr_inc;	/*!< MBR enlargement needed by this
+					node for insertion (TODO confirm) */
+};
+
+typedef std::vector<node_visit_t, ut_allocator<node_visit_t> >	rtr_node_path_t;
+
+typedef	struct rtr_rec {
+		rec_t*	r_rec;		/*!< matched record */
+		bool	locked;		/*!< whether the record is locked */
+} rtr_rec_t;
+
+typedef std::vector<rtr_rec_t, ut_allocator<rtr_rec_t> >	rtr_rec_vector;
+
+/* Structure for matched records on the leaf page */
+typedef	struct matched_rec {
+	byte*		bufp;		/*!< aligned buffer pointer */
+	byte		rec_buf[UNIV_PAGE_SIZE_MAX * 2];
+					/*!< buffer used to copy matching rec */
+	buf_block_t	block;		/*!< the shadow buffer block */
+	ulint		used;		/*!< memory used */
+	rtr_rec_vector*	matched_recs;	/*!< vector holding the matching rec */
+	mysql_mutex_t	rtr_match_mutex;/*!< mutex protecting the matched_recs
+					vector */
+	bool		valid;		/*!< whether result in matched_recs
+					or this search is valid (page not
+					dropped) */
+	bool		locked;		/*!< whether these recs are locked */
+} matched_rec_t;
+
+/* In-memory MBR; also accessed as an array of doubles, so field order matters */
+typedef struct rtr_mbr {
+	double	xmin;			/*!< minimum on x */
+	double	xmax;			/*!< maximum on x */
+	double	ymin;			/*!< minimum on y */
+	double	ymax;			/*!< maximum on y */
+} rtr_mbr_t;
+
+/* Maximum index level for R-Tree, this is consistent with BTR_MAX_LEVELS */
+#define RTR_MAX_LEVELS		100
+
+/* Number of pages we latch at leaf level when there is possible Tree
+modification (split, shrink), we always latch left, current
+and right pages */
+#define RTR_LEAF_LATCH_NUM	3
+
+/** Vectors holding the matching internal pages/nodes and leaf records */
+typedef	struct rtr_info{
+	rtr_node_path_t*path;	/*!< vector holding matching pages */
+	rtr_node_path_t*parent_path;
+				/*!< vector holding parent pages during
+				search */
+	matched_rec_t*	matches;/*!< struct holding matching leaf records */
+	mysql_mutex_t	rtr_path_mutex;
+				/*!< mutex for "path" and "parent_path" */
+	rtr_mbr_t	mbr;	/*!< the search MBR */
+	que_thr_t*      thr;	/*!< the search thread */
+	mem_heap_t*	heap;	/*!< memory heap */
+	btr_cur_t*	cursor;	/*!< cursor used for search */
+	dict_index_t*	index;	/*!< index it is searching */
+	bool		need_prdt_lock;
+				/*!< whether we will need a predicate lock
+				on the tree */
+	bool		need_page_lock;
+				/*!< whether we will need a predicate page
+				lock on the tree */
+	bool		allocated;/*!< whether this structure is allocated or
+				on stack */
+	bool		mbr_adj;/*!< whether mbr will need to be enlarged
+				for an insertion operation */
+	bool		fd_del;	/*!< found deleted row */
+	const dtuple_t*	search_tuple;
+				/*!< search tuple being used */
+	page_cur_mode_t	search_mode;
+				/*!< current search mode */
+} rtr_info_t;
+
+/* Tracking structure for all ongoing search for an index */
+struct rtr_info_track_t {
+	/** Active search info */
+	std::forward_list<rtr_info_t*, ut_allocator<rtr_info_t*> > rtr_active;
+	mysql_mutex_t rtr_active_mutex;
+						/*!< mutex to protect
+						rtr_active */
+};
+
+/* This is to record the record movement between pages. Used for corresponding
+lock movement */
+typedef struct rtr_rec_move {
+	rec_t*		old_rec;	/*!< record being moved in old page */
+	rec_t*		new_rec;	/*!< new record location */
+	bool		moved;		/*!< whether locks are moved too */
+} rtr_rec_move_t;
+#endif /*!< gis0type.h */
diff --git a/storage/innobase/include/ha0ha.h b/storage/innobase/include/ha0ha.h
new file mode 100644
index 00000000..5aaa559b
--- /dev/null
+++ b/storage/innobase/include/ha0ha.h
@@ -0,0 +1,60 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0ha.h
+The hash table interface for the adaptive hash index
+
+Created 8/18/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef ha0ha_h
+#define ha0ha_h
+
+#include "hash0hash.h"
+#include "page0types.h"
+#include "buf0types.h"
+#include "rem0types.h"
+
+#ifdef BTR_CUR_HASH_ADAPT
+/*************************************************************//**
+Looks for an element in a hash table.
+@return pointer to the data of the first hash table node in chain
+having the fold number, NULL if not found */
+UNIV_INLINE
+const rec_t*
+ha_search_and_get_data(
+/*===================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: folded value of the searched data */
+
+/** The hash table external chain node; nodes are linked through "next" */
+struct ha_node_t {
+	ulint		fold;	/*!< fold value for the data */
+	ha_node_t*	next;	/*!< next chain node or NULL if none */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	buf_block_t*	block;	/*!< buffer block containing the data, or NULL;
+				this member exists in debug builds only */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	const rec_t*	data;	/*!< pointer to the data */
+};
+
+#include "ha0ha.inl"
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#endif
diff --git a/storage/innobase/include/ha0ha.inl b/storage/innobase/include/ha0ha.inl
new file mode 100644
index 00000000..0b256257
--- /dev/null
+++ b/storage/innobase/include/ha0ha.inl
@@ -0,0 +1,154 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/ha0ha.inl
+The hash table interface for the adaptive hash index
+
+Created 8/18/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifdef BTR_CUR_HASH_ADAPT
+#include "btr0types.h"
+
+/** Read the data pointer stored in a hash chain node.
+@param[in]	node	hash chain node
+@return pointer to the data */
+UNIV_INLINE
+const rec_t*
+ha_node_get_data(const ha_node_t* node)
+{
+	const rec_t*	rec = node->data;
+
+	return rec;
+}
+
+/** Store a data pointer in a hash chain node.
+@param node   in/out: hash chain node
+@param block  in: buffer block containing the data (debug builds only)
+@param data   in: pointer to the data */
+UNIV_INLINE void
+ha_node_set_data_func(
+	ha_node_t*	node,
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	buf_block_t*	block,
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	const rec_t*	data)
+{
+	node->data = data;
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	node->block = block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/** Sets hash node data; this variant passes the buffer block along.
+@param n in: hash chain node
+@param b in: buffer block containing the data
+@param d in: pointer to the data */
+# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,b,d)
+#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/** Sets hash node data; this variant drops the buffer block argument.
+@param n in: hash chain node
+@param b in: buffer block containing the data (unused, not evaluated)
+@param d in: pointer to the data */
+# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,d)
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+/** Step from a node to its successor in the hash chain.
+@param[in]	node	hash chain node
+@return next node, NULL if none */
+UNIV_INLINE
+ha_node_t*
+ha_chain_get_next(const ha_node_t* node)
+{
+	ha_node_t*	successor = node->next;
+
+	return successor;
+}
+
+/** Look up the head of the hash chain that a fold value maps to.
+@param[in]	table	hash table
+@param[in]	fold	fold value determining the chain
+@return first node, NULL if none */
+UNIV_INLINE
+ha_node_t*
+ha_chain_get_first(hash_table_t* table, ulint fold)
+{
+	const auto	cell_no = table->calc_hash(fold);
+
+	return static_cast<ha_node_t*>(table->array[cell_no].node);
+}
+
+/** Find the data of the first node in a hash chain whose fold value
+matches the requested one.
+@param[in]	table	hash table
+@param[in]	fold	folded value of the searched data
+@return pointer to the data of the first hash table node in chain
+having the fold number, NULL if not found */
+UNIV_INLINE
+const rec_t*
+ha_search_and_get_data(hash_table_t* table, ulint fold)
+{
+	ut_ad(btr_search_enabled);
+
+	const ha_node_t*	cur = ha_chain_get_first(table, fold);
+
+	/* Skip over the nodes that carry a different fold value. */
+	while (cur != NULL && cur->fold != fold) {
+		cur = ha_chain_get_next(cur);
+	}
+
+	if (cur == NULL) {
+		return NULL;
+	}
+
+	return cur->data;
+}
+
+/** Find the hash chain node that holds a given data pointer.
+Only the chain selected by the fold value is scanned, and nodes are
+compared by their data pointer, not by their fold value.
+@param[in]	table	hash table
+@param[in]	fold	folded value of the searched data
+@param[in]	data	pointer to the data
+@return pointer to the hash table node, NULL if not found in the table */
+UNIV_INLINE
+ha_node_t*
+ha_search_with_data(
+	hash_table_t*	table,
+	ulint		fold,
+	const rec_t*	data)
+{
+	ut_ad(btr_search_enabled);
+
+	for (ha_node_t* cur = ha_chain_get_first(table, fold);
+	     cur != NULL;
+	     cur = ha_chain_get_next(cur)) {
+		if (cur->data != data) {
+			continue;
+		}
+
+		return cur;
+	}
+
+	return NULL;
+}
+
+#endif /* BTR_CUR_HASH_ADAPT */
diff --git a/storage/innobase/include/ha0storage.h b/storage/innobase/include/ha0storage.h
new file mode 100644
index 00000000..fdf50a2e
--- /dev/null
+++ b/storage/innobase/include/ha0storage.h
@@ -0,0 +1,137 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0storage.h
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 22, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef ha0storage_h
+#define ha0storage_h
+
+#include "univ.i"
+
+/** Initial heap size in bytes that ha_storage_create() uses when it is
+passed 0. More memory is allocated later when/if it is needed. */
+#define HA_STORAGE_DEFAULT_HEAP_BYTES	1024
+
+/** Number of hash cells that ha_storage_create() uses when it is
+passed 0. It is a constant per ha_storage's lifetime. */
+#define HA_STORAGE_DEFAULT_HASH_CELLS	4096
+
+/** Hash storage */
+struct ha_storage_t;
+
+/*******************************************************************//**
+Creates a hash storage. If any of the parameters is 0, then a default
+value is used.
+@return own: hash storage */
+UNIV_INLINE
+ha_storage_t*
+ha_storage_create(
+/*==============*/
+	ulint	initial_heap_bytes,	/*!< in: initial heap's size */
+	ulint	initial_hash_cells);	/*!< in: initial number of cells
+					in the hash table */
+
+/*******************************************************************//**
+Copies data into the storage and returns a pointer to the copy. If the
+same data chunk is already present, then pointer to it is returned.
+Data chunks are considered to be equal if len1 == len2 and
+memcmp(data1, data2, len1) == 0. If "data" is not present (and thus
+data_len bytes need to be allocated) and the size of storage is going to
+become more than "memlim" then "data" is not added and NULL is returned.
+To disable this behavior "memlim" can be set to 0, which stands for
+"no limit".
+@return pointer to the copy */
+const void*
+ha_storage_put_memlim(
+/*==================*/
+	ha_storage_t*	storage,	/*!< in/out: hash storage */
+	const void*	data,		/*!< in: data to store */
+	ulint		data_len,	/*!< in: data length */
+	ulint		memlim);	/*!< in: memory limit to obey */
+
+/*******************************************************************//**
+Same as ha_storage_put_memlim() with memlim = 0, i.e. without memory limit.
+@param storage in/out: hash storage
+@param data in: data to store
+@param data_len in: data length
+@return pointer to the copy of the data */
+#define ha_storage_put(storage, data, data_len)	\
+	ha_storage_put_memlim((storage), (data), (data_len), 0)
+
+/*******************************************************************//**
+Copies string into the storage and returns a pointer to the copy. If the
+same string is already present, then pointer to it is returned.
+Strings are considered to be equal if strcmp(str1, str2) == 0.
+@param storage in/out: hash storage
+@param str in: string to put (expanded twice: must have no side effects)
+@return pointer to the copy of the string */
+#define ha_storage_put_str(storage, str)	\
+	((const char*) ha_storage_put((storage), (str), strlen(str) + 1))
+
+/*******************************************************************//**
+Copies string into the storage and returns a pointer to the copy obeying
+a memory limit.
+If the same string is already present, then pointer to it is returned.
+Strings are considered to be equal if strcmp(str1, str2) == 0.
+@param storage in/out: hash storage
+@param str in: string to put (expanded twice: must have no side effects)
+@param memlim in: memory limit to obey
+@return pointer to the copy of the string */
+#define ha_storage_put_str_memlim(storage, str, memlim)	\
+	((const char*) ha_storage_put_memlim((storage), (str),	\
+					     strlen(str) + 1, (memlim)))
+
+/*******************************************************************//**
+Empties a hash storage, freeing memory occupied by data chunks.
+This invalidates any pointers previously returned by ha_storage_put().
+The hash storage is not invalidated itself and can be used again. */
+UNIV_INLINE
+void
+ha_storage_empty(
+/*=============*/
+	ha_storage_t**	storage);	/*!< in/out: hash storage */
+
+/*******************************************************************//**
+Frees a hash storage and everything it contains, it cannot be used after
+this call.
+This invalidates any pointers previously returned by ha_storage_put(). */
+UNIV_INLINE
+void
+ha_storage_free(
+/*============*/
+	ha_storage_t*	storage);	/*!< in, own: hash storage */
+
+/*******************************************************************//**
+Gets the size of the memory used by a storage.
+@return bytes used */
+UNIV_INLINE
+ulint
+ha_storage_get_size(
+/*================*/
+	const ha_storage_t*	storage);	/*!< in: hash storage */
+
+#include "ha0storage.inl"
+
+#endif /* ha0storage_h */
diff --git a/storage/innobase/include/ha0storage.inl b/storage/innobase/include/ha0storage.inl
new file mode 100644
index 00000000..df9679cf
--- /dev/null
+++ b/storage/innobase/include/ha0storage.inl
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0storage.inl
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 24, 2007 Vasil Dimov
+*******************************************************/
+
+#include "hash0hash.h"
+#include "mem0mem.h"
+
+/** Hash storage; ha_storage_create() places this object inside "heap" */
+struct ha_storage_t {
+	mem_heap_t*	heap;	/*!< memory heap from which memory is
+				allocated */
+	hash_table_t	hash;	/*!< hash table used to avoid
+				duplicates */
+};
+
+/** A stored data chunk; objects of this type are stored in ha_storage_t */
+struct ha_storage_node_t {
+	ulint			data_len;/*!< length of the data */
+	const void*		data;	/*!< pointer to data */
+	ha_storage_node_t*	next;	/*!< next node in hash chain */
+};
+
+/** Create a hash storage.
+A parameter that is passed as 0 is replaced by its default:
+HA_STORAGE_DEFAULT_HEAP_BYTES or HA_STORAGE_DEFAULT_HASH_CELLS,
+respectively.
+@param[in]	initial_heap_bytes	initial heap's size
+@param[in]	initial_hash_cells	initial number of cells
+					in the hash table
+@return own: hash storage */
+UNIV_INLINE
+ha_storage_t*
+ha_storage_create(
+	ulint	initial_heap_bytes,
+	ulint	initial_hash_cells)
+{
+	/* Substitute the defaults for parameters passed as 0. */
+
+	const ulint	heap_bytes = initial_heap_bytes != 0
+		? initial_heap_bytes
+		: HA_STORAGE_DEFAULT_HEAP_BYTES;
+
+	const ulint	n_cells = initial_hash_cells != 0
+		? initial_hash_cells
+		: HA_STORAGE_DEFAULT_HASH_CELLS;
+
+	/* The storage object is carved out of its own heap, so the
+	heap is sized to hold it in addition to the requested bytes. */
+
+	mem_heap_t*	storage_heap = mem_heap_create(
+		sizeof(ha_storage_t) + heap_bytes);
+
+	ha_storage_t*	result = static_cast<ha_storage_t*>(
+		mem_heap_alloc(storage_heap, sizeof(ha_storage_t)));
+
+	result->heap = storage_heap;
+	result->hash.create(n_cells);
+
+	return result;
+}
+
+/** Empty a hash storage, freeing the memory occupied by data chunks.
+Pointers previously returned by ha_storage_put() become invalid; the
+storage itself stays usable. The storage object is allocated from its
+own heap (see ha_storage_create()), so it is allocated anew once the
+heap has been emptied, and *storage is set to the new object.
+@param[in,out]	storage	hash storage */
+UNIV_INLINE
+void
+ha_storage_empty(ha_storage_t** storage)
+{
+	/* Keep a copy of the members on the stack while the heap that
+	the storage object was allocated from is being emptied. */
+	ha_storage_t	saved = **storage;
+
+	saved.hash.clear();
+	mem_heap_empty(saved.heap);
+
+	ha_storage_t*	fresh = static_cast<ha_storage_t*>(
+		mem_heap_alloc(saved.heap, sizeof(ha_storage_t)));
+
+	*fresh = saved;
+
+	*storage = fresh;
+}
+
+/** Free a hash storage and everything it contains. The storage must not
+be used after this call, and any pointers previously returned by
+ha_storage_put() become invalid.
+@param[in]	storage	own: hash storage */
+UNIV_INLINE
+void
+ha_storage_free(ha_storage_t* storage)
+{
+	mem_heap_t*	storage_heap = storage->heap;
+
+	storage->hash.free();
+	mem_heap_free(storage_heap);
+}
+
+/** Gets the size of the memory used by a storage, computed as the size
+of its memory heap plus the size of the hash table object and of its
+array of cells.
+@param[in]	storage	hash storage
+@return bytes used */
+UNIV_INLINE
+ulint
+ha_storage_get_size(const ha_storage_t* storage)
+{
+	const ulint	heap_size = mem_heap_get_size(storage->heap);
+
+	/* Only the table object and its cell array are counted here;
+	this assumes that the hash table owns no other memory
+	(TODO confirm against hash_table_t). */
+	const ulint	table_size = sizeof(hash_table_t)
+		+ sizeof(hash_cell_t) * storage->hash.n_cells;
+
+	return heap_size + table_size;
+}
diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h
new file mode 100644
index 00000000..d5239ec3
--- /dev/null
+++ b/storage/innobase/include/ha_prototypes.h
@@ -0,0 +1,476 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ha_prototypes.h
+Prototypes for global functions in ha_innodb.cc that are called by
+InnoDB C code.
+
+NOTE: This header is intended to insulate InnoDB from SQL names and functions.
+Do not include any headers other than univ.i into this unless they are very
+simple headers.
+************************************************************************/
+
+#ifndef HA_INNODB_PROTOTYPES_H
+#define HA_INNODB_PROTOTYPES_H
+
+#include "univ.i"
+
+#ifndef UNIV_INNOCHECKSUM
+
+/* Forward declarations */
+class THD;
+class Field;
+
+// JAN: TODO missing features:
+#undef MYSQL_FT_INIT_EXT
+#undef MYSQL_PFS
+#undef MYSQL_STORE_FTS_DOC_ID
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes
+the result to "buf". The result is converted to "system_charset_info".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+ulint
+innobase_raw_format(
+/*================*/
+	const char*	data,		/*!< in: raw data */
+	ulint		data_len,	/*!< in: raw data length
+					in bytes */
+	ulint		charset_coll,	/*!< in: charset collation */
+	char*		buf,		/*!< out: output buffer */
+	ulint		buf_size);	/*!< in: output buffer size
+					in bytes */
+
+/*****************************************************************//**
+Invalidates the MySQL query cache for the table. */
+void
+innobase_invalidate_query_cache(
+/*============================*/
+	trx_t*		trx,		/*!< in: transaction which
+					modifies the table */
+	const char*	full_name);	/*!< in: concatenation of
+					database name, path separator,
+					table name, null char NUL;
+					NOTE that in Windows this is
+					always in LOWER CASE! */
+
+/** Quote a standard SQL identifier like tablespace, index or column name.
+@param[in]	file	output stream
+@param[in]	trx	InnoDB transaction, or NULL
+@param[in]	id	identifier to quote */
+void
+innobase_quote_identifier(
+	FILE*		file,
+	trx_t*		trx,
+	const char*	id);
+
+/** Quote a standard SQL identifier like tablespace, index or column name.
+Return the string as a std::string object.
+@param[in]	trx	InnoDB transaction, or NULL
+@param[in]	id	identifier to quote
+@return a std::string with id properly quoted. */
+std::string
+innobase_quote_identifier(
+	trx_t*		trx,
+	const char*	id);
+
+/*****************************************************************//**
+Convert a table name to the MySQL system_charset_info (UTF-8).
+@return pointer to the end of buf */
+char*
+innobase_convert_name(
+/*==================*/
+	char*		buf,	/*!< out: buffer for converted identifier */
+	ulint		buflen,	/*!< in: length of buf, in bytes */
+	const char*	id,	/*!< in: table name to convert */
+	ulint		idlen,	/*!< in: length of id, in bytes */
+	THD*		thd);	/*!< in: MySQL connection thread, or NULL */
+
+/******************************************************************//**
+Returns true if the transaction this thread is processing has edited
+non-transactional tables. Used by the deadlock detector when deciding
+which transaction to rollback in case of a deadlock - we try to avoid
+rolling back transactions that have edited non-transactional tables.
+@return true if non-transactional tables have been edited */
+ibool
+thd_has_edited_nontrans_tables(
+/*===========================*/
+	THD*	thd);	/*!< in: thread handle */
+
+/*************************************************************//**
+Prints info of a THD object (== user session thread) to the given file. */
+void
+innobase_mysql_print_thd(
+/*=====================*/
+	FILE*	f,		/*!< in: output stream */
+	THD*	thd,		/*!< in: pointer to a MySQL THD object */
+	uint	max_query_len);	/*!< in: max query length to print, or 0 to
+				   use the default max length */
+
+/** Converts a MySQL type to an InnoDB type. Note that this function returns
+the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
+VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'.
+@param[out]	unsigned_flag		DATA_UNSIGNED if an 'unsigned type';
+at least ENUM and SET, and unsigned integer types are 'unsigned types'
+@param[in]	field			MySQL Field
+@return DATA_BINARY, DATA_VARCHAR, ... */
+uint8_t
+get_innobase_type_from_mysql_type(unsigned *unsigned_flag, const Field *field);
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively.
+@return 0 if a=b, <0 if a<b, >0 if a>b */
+int
+innobase_strcasecmp(
+/*================*/
+	const char*	a,	/*!< in: first string to compare */
+	const char*	b);	/*!< in: second string to compare */
+
+/** Strip dir name from a full path name and return only the file name
+@param[in]	path_name	full path name
+@return file name or "null" if no file name */
+const char*
+innobase_basename(
+	const char*	path_name);
+
+/******************************************************************//**
+Converts an identifier to a table name. */
+void
+innobase_convert_from_table_id(
+/*===========================*/
+	CHARSET_INFO*	cs,	/*!< in: the 'from' character set */
+	char*		to,	/*!< out: converted identifier */
+	const char*	from,	/*!< in: identifier to convert */
+	ulint		len);	/*!< in: length of 'to', in bytes; should
+				be at least 5 * strlen(from) + 1 */
+/******************************************************************//**
+Converts an identifier to UTF-8. */
+void
+innobase_convert_from_id(
+/*=====================*/
+	CHARSET_INFO*	cs,	/*!< in: the 'from' character set */
+	char*		to,	/*!< out: converted identifier */
+	const char*	from,	/*!< in: identifier to convert */
+	ulint		len);	/*!< in: length of 'to', in bytes;
+				should be at least 3 * strlen(from) + 1 */
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+void
+innobase_casedn_str(
+/*================*/
+	char*	a);	/*!< in/out: string to put in lower case */
+
+#ifdef WITH_WSREP
+ulint wsrep_innobase_mysql_sort(int mysql_type, uint charset_number,
+                             unsigned char* str, ulint str_length,
+                             ulint buf_length);
+#endif /* WITH_WSREP */
+
+extern "C" struct charset_info_st *thd_charset(THD *thd);
+
+/** Get high resolution timestamp for the current query start time.
+The timestamp is not anchored to any specific point in time,
+but can be used for comparison.
+@param thd user thread
+@return timestamp with microsecond precision
+*/
+extern "C" unsigned long long thd_start_utime(const MYSQL_THD thd);
+
+
+/** Determines the current SQL statement.
+Thread unsafe, can only be called from the thread owning the THD.
+@param[in]	thd	MySQL thread handle
+@param[out]	length	Length of the SQL statement
+@return			SQL statement string */
+const char*
+innobase_get_stmt_unsafe(
+	THD*	thd,
+	size_t*	length);
+
+/******************************************************************//**
+This function is used to find the storage length in bytes of the first n
+characters for prefix indexes using a multibyte character set. The function
+finds charset information and returns length of prefix_len characters in the
+index field in bytes.
+@return number of bytes occupied by the first n characters */
+ulint
+innobase_get_at_most_n_mbchars(
+/*===========================*/
+	ulint charset_id,	/*!< in: character set id */
+	ulint prefix_len,	/*!< in: prefix length in bytes of the index
+				(this has to be divided by mbmaxlen to get the
+				number of CHARACTERS n in the prefix) */
+	ulint data_len,		/*!< in: length of the string in bytes */
+	const char* str);	/*!< in: character string */
+
+/** Get status of innodb_tmpdir.
+@param[in]	thd	thread handle, or NULL to query
+			the global innodb_tmpdir.
+@retval NULL if innodb_tmpdir="" */
+const char *thd_innodb_tmpdir(THD *thd);
+
+/******************************************************************//**
+Returns the lock wait timeout for the current connection.
+@return the lock wait timeout, in seconds */
+uint&
+thd_lock_wait_timeout(
+/*==================*/
+	THD*	thd);	/*!< in: thread handle, or NULL to query
+			the global innodb_lock_wait_timeout */
+
+/******************************************************************//**
+Compares two character strings case insensitively according to their charset. */
+int
+innobase_fts_text_case_cmp(
+/*=======================*/
+	const void*	cs,		/*!< in: Character set */
+	const void*	p1,		/*!< in: key */
+	const void*	p2);		/*!< in: node */
+
+/******************************************************************//**
+Returns true if transaction should be flagged as read-only.
+@return true if the thd is marked as read-only */
+bool
+thd_trx_is_read_only(
+/*=================*/
+	THD*	thd);	/*!< in/out: thread handle */
+
+/******************************************************************//**
+Check if the transaction is an auto-commit transaction. TRUE also
+implies that it is a SELECT (read-only) transaction.
+@return true if the transaction is an auto commit read-only transaction. */
+ibool
+thd_trx_is_auto_commit(
+/*===================*/
+	THD*	thd);	/*!< in: thread handle, or NULL */
+
+/*****************************************************************//**
+A wrapper function of innobase_convert_name(): converts a table name
+to the MySQL system_charset_info (UTF-8) and quotes it if needed.
+The result is written to buf; nothing is returned. */
+void
+innobase_format_name(
+/*==================*/
+	char*		buf,	/*!< out: buffer for converted identifier */
+	ulint		buflen,	/*!< in: length of buf, in bytes */
+	const char*	name);	/*!< in: table name to format */
+
+/** Corresponds to Sql_condition::enum_warning_level. */
+enum ib_log_level_t {
+	IB_LOG_LEVEL_INFO,
+	IB_LOG_LEVEL_WARN,
+	IB_LOG_LEVEL_ERROR,
+	IB_LOG_LEVEL_FATAL
+};
+
+/******************************************************************//**
+Use this when the args are first converted to a formatted string and then
+passed to the format string from errmsg-utf8.txt. The error message format
+must be: "Some string ... %s".
+
+Push a warning message to the client, it is a wrapper around:
+
+void push_warning_printf(
+	THD *thd, Sql_condition::enum_warning_level level,
+	uint code, const char *format, ...);
+*/
+void
+ib_errf(
+/*====*/
+	THD*		thd,		/*!< in/out: session */
+	ib_log_level_t	level,		/*!< in: warning level */
+	ib_uint32_t	code,		/*!< MySQL error code */
+	const char*	format,		/*!< printf format */
+	...)				/*!< Args */
+	MY_ATTRIBUTE((format(printf, 4, 5)));
+
+/******************************************************************//**
+Use this when the args are passed to the format string from
+errmsg-utf8.txt directly as is.
+
+Push a warning message to the client, it is a wrapper around:
+
+void push_warning_printf(
+	THD *thd, Sql_condition::enum_warning_level level,
+	uint code, const char *format, ...);
+*/
+void
+ib_senderrf(
+/*========*/
+	THD*		thd,		/*!< in/out: session */
+	ib_log_level_t	level,		/*!< in: warning level */
+	ib_uint32_t	code,		/*!< MySQL error code */
+	...);				/*!< Args */
+
+extern const char* 	TROUBLESHOOTING_MSG;
+extern const char* 	TROUBLESHOOT_DATADICT_MSG;
+extern const char* 	BUG_REPORT_MSG;
+extern const char* 	FORCE_RECOVERY_MSG;
+extern const char*      OPERATING_SYSTEM_ERROR_MSG;
+extern const char*      FOREIGN_KEY_CONSTRAINTS_MSG;
+extern const char*      SET_TRANSACTION_MSG;
+extern const char*      INNODB_PARAMETERS_MSG;
+
+/******************************************************************//**
+Returns the NUL terminated value of glob_hostname.
+@return pointer to glob_hostname. */
+const char*
+server_get_hostname();
+/*=================*/
+
+/*********************************************************************//**
+Compute the next autoinc value.
+
+For MySQL replication the autoincrement values can be partitioned among
+the nodes. The offset is the start or origin of the autoincrement value
+for a particular node. For n nodes the increment will be n and the offset
+will be in the interval [1, n]. The formula tries to allocate the next
+value for a particular node.
+
+Note: This function is also called with increment set to the number of
+values we want to reserve for multi-value inserts e.g.,
+
+	INSERT INTO T VALUES(), (), ();
+
+innobase_next_autoinc() will be called with increment set to 3 where
+autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for
+the multi-value INSERT above.
+@return the next value */
+ulonglong
+innobase_next_autoinc(
+/*==================*/
+	ulonglong	current,	/*!< in: Current value */
+	ulonglong	need,		/*!< in: count of values needed */
+	ulonglong	step,		/*!< in: AUTOINC increment step */
+	ulonglong	offset,		/*!< in: AUTOINC offset */
+	ulonglong	max_value)	/*!< in: max value for type */
+	MY_ATTRIBUTE((pure, warn_unused_result));
+
+/**********************************************************************
+Converts an identifier from my_charset_filename to UTF-8 charset. */
+uint
+innobase_convert_to_system_charset(
+/*===============================*/
+	char*           to,		/* out: converted identifier */
+	const char*     from,		/* in: identifier to convert */
+	ulint           len,		/* in: length of 'to', in bytes */
+	uint*		errors);	/* out: error return */
+/**********************************************************************
+Check if the length of the identifier exceeds the maximum allowed.
+The input to this function is an identifier in charset my_charset_filename.
+return true when length of identifier is too long. */
+my_bool
+innobase_check_identifier_length(
+/*=============================*/
+	const char*	id);	/* in: identifier to check.  it must belong
+				to charset my_charset_filename */
+
+/**********************************************************************
+Converts an identifier from my_charset_filename to UTF-8 charset. */
+uint
+innobase_convert_to_system_charset(
+/*===============================*/
+	char*		to,		/* out: converted identifier */
+	const char*	from,		/* in: identifier to convert */
+	ulint		len,		/* in: length of 'to', in bytes */
+	uint*		errors);	/* out: error return */
+
+/**********************************************************************
+Converts an identifier to my_charset_filename (presumably from UTF-8). */
+uint
+innobase_convert_to_filename_charset(
+/*=================================*/
+	char*		to,	/* out: converted identifier */
+	const char*	from,	/* in: identifier to convert */
+	ulint		len);	/* in: length of 'to', in bytes */
+
+/********************************************************************//**
+Helper function to push warnings from InnoDB internals to SQL-layer. */
+void
+ib_push_warning(
+	trx_t*		trx,	/*!< in: trx */
+	dberr_t		error,	/*!< in: error code to push as warning */
+	const char	*format,/*!< in: warning message */
+	...);
+
+/********************************************************************//**
+Helper function to push warnings from InnoDB internals to SQL-layer. */
+void
+ib_push_warning(
+	void*		ithd,	/*!< in: thd */
+	dberr_t		error,	/*!< in: error code to push as warning */
+	const char	*format,/*!< in: warning message */
+	...);
+
+/********************************************************************//**
+Helper function to push warnings from InnoDB internals to SQL-layer. */
+void
+ib_foreign_warn(
+	trx_t*		trx,	/*!< in: trx */
+	dberr_t		error,	/*!< in: error code to push as warning */
+	const char	*table_name,
+	const char	*format,/*!< in: warning message */
+	...);
+
+/*****************************************************************//**
+Normalizes a table name string. A normalized name consists of the
+database name catenated to '/' and table name. An example:
+test/mytable. On Windows normalization puts both the database name and the
+table name always to lower case if "set_lower_case" is set to TRUE. */
+void
+normalize_table_name_c_low(
+/*=======================*/
+	char*		norm_name,	/*!< out: normalized name as a
+					null-terminated string */
+	const char*	name,		/*!< in: table name string */
+	bool		set_lower_case); /*!< in: true if we want to set
+					name to lower case */
+
+/** Create a MYSQL_THD for a background thread and mark it as such.
+@param name thread info for SHOW PROCESSLIST
+@return new MYSQL_THD */
+MYSQL_THD innobase_create_background_thd(const char* name);
+
+/** Destroy a THD object associated with a background task.
+@param[in]	thd	MYSQL_THD to destroy */
+void destroy_background_thd(MYSQL_THD thd);
+
+/** Close opened tables, free memory, delete items for a MYSQL_THD.
+@param[in]	thd	MYSQL_THD to reset */
+void
+innobase_reset_background_thd(MYSQL_THD);
+
+#ifdef WITH_WSREP
+/** Append table-level exclusive key.
+@param thd   MySQL thread handle
+@param table table
+@retval false on success
+@retval true on failure */
+struct dict_table_t;
+bool wsrep_append_table_key(MYSQL_THD thd, const dict_table_t &table);
+#endif /* WITH_WSREP */
+
+#endif /* !UNIV_INNOCHECKSUM */
+#endif /* HA_INNODB_PROTOTYPES_H */
diff --git a/storage/innobase/include/handler0alter.h b/storage/innobase/include/handler0alter.h
new file mode 100644
index 00000000..add983a0
--- /dev/null
+++ b/storage/innobase/include/handler0alter.h
@@ -0,0 +1,108 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/handler0alter.h
+Smart ALTER TABLE
+*******************************************************/
+
+#include "rem0types.h"
+
+/*************************************************************//**
+Copies an InnoDB record to table->record[0]. Arguments are declared nonnull. */
+void
+innobase_rec_to_mysql(
+/*==================*/
+	struct TABLE*		table,	/*!< in/out: MySQL table */
+	const rec_t*		rec,	/*!< in: InnoDB record */
+	const dict_index_t*	index,	/*!< in: index of rec */
+	const rec_offs*		offsets)/*!< in: rec_get_offsets(
+					rec, index, ...) */
+	MY_ATTRIBUTE((nonnull));
+
+/*************************************************************//**
+Copies an InnoDB index entry to table->record[0]. Arguments are declared nonnull. */
+void
+innobase_fields_to_mysql(
+/*=====================*/
+	struct TABLE*		table,	/*!< in/out: MySQL table */
+	const dict_index_t*	index,	/*!< in: InnoDB index */
+	const dfield_t*		fields)	/*!< in: InnoDB index fields */
+	MY_ATTRIBUTE((nonnull));
+
+/*************************************************************//**
+Copies an InnoDB row to table->record[0]. Arguments are declared nonnull. */
+void
+innobase_row_to_mysql(
+/*==================*/
+	struct TABLE*		table,	/*!< in/out: MySQL table */
+	const dict_table_t*	itab,	/*!< in: InnoDB table */
+	const dtuple_t*		row)	/*!< in: InnoDB row */
+	MY_ATTRIBUTE((nonnull));
+
+/** Generator of AUTO_INCREMENT values, built from a snapshot of the session
+auto_increment_increment and auto_increment_offset variables. */
+struct ib_sequence_t {
+
+	/** Constructor.
+	@param thd the session
+	@param start_value the lower bound
+	@param max_value the upper bound (inclusive) */
+	ib_sequence_t(THD* thd, ulonglong start_value, ulonglong max_value);
+
+	/** Postfix increment
+	@return the value to insert */
+	ulonglong operator++(int) UNIV_NOTHROW;
+
+	/** Check if the autoinc "sequence" is exhausted.
+	@return true if the sequence is exhausted */
+	bool eof() const UNIV_NOTHROW { return m_eof; }
+
+	/** Read the pending value without modifying the sequence.
+	The value is asserted (ut_ad) to be greater than 0.
+	@return the next value in the sequence */
+	ulonglong last() const UNIV_NOTHROW
+	{
+		ut_ad(m_next_value > 0);
+		return m_next_value;
+	}
+
+	/** @return maximum column value
+	@retval	0	if not adding AUTO_INCREMENT column */
+	ulonglong max_value() const
+	{
+		return m_max_value;
+	}
+
+private:
+	/** Maximum value if adding an AUTO_INCREMENT column, else 0 */
+	ulonglong	m_max_value;
+
+	/** Value of auto_increment_increment */
+	ulong		m_increment;
+
+	/** Value of auto_increment_offset */
+	ulong		m_offset;
+
+	/** Next value in the sequence */
+	ulonglong	m_next_value;
+
+	/** true if no more values left in the sequence */
+	bool		m_eof;
+};
diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h
new file mode 100644
index 00000000..867ad9e0
--- /dev/null
+++ b/storage/innobase/include/hash0hash.h
@@ -0,0 +1,190 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/hash0hash.h
+The simple hash table utility
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "ut0rnd.h"
+#include "ut0new.h"
+
+struct hash_table_t;
+struct hash_cell_t
+{
+  /** head of the singly-linked, nullptr terminated chain of this cell */
+  void *node;
+
+  /** Append an element to the end of the chain.
+  @tparam T      type of the element
+  @param insert  the element being inserted
+  @param next    the next-element pointer in T */
+  template<typename T>
+  void append(T &insert, T *T::*next)
+  {
+    void **tail= &node;
+    while (*tail)
+      tail= reinterpret_cast<void**>(&(static_cast<T*>(*tail)->*next));
+    insert.*next= nullptr;
+    *tail= &insert;
+  }
+};
+
+/*******************************************************************//**
+Inserts a struct at the end of the hash chain of the cell that FOLD maps to.
+NAME is the next-pointer member of TYPE; DATA points to the struct. */
+#define HASH_INSERT(TYPE, NAME, TABLE, FOLD, DATA)\
+do {\
+	hash_cell_t*	cell3333;\
+	TYPE*		struct3333;\
+\
+	(DATA)->NAME = NULL;\
+\
+	cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)];	\
+\
+	if (cell3333->node == NULL) {\
+		cell3333->node = (DATA);\
+	} else {\
+		struct3333 = (TYPE*) cell3333->node;\
+\
+		while (struct3333->NAME != NULL) {\
+\
+			struct3333 = (TYPE*) struct3333->NAME;\
+		}\
+\
+		struct3333->NAME = (DATA);\
+	}\
+} while (0)
+
+#ifdef UNIV_HASH_DEBUG /* poison next-pointers with (void*) -1 and assert on them */
+# define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1)
+# define HASH_INVALIDATE(DATA, NAME) *(void**) (&(DATA)->NAME) = (void*) -1
+#else
+# define HASH_ASSERT_VALID(DATA) do {} while (0)
+# define HASH_INVALIDATE(DATA, NAME) do {} while (0)
+#endif /* UNIV_HASH_DEBUG */
+
+/*******************************************************************//**
+Deletes a struct from a hash table. DATA is expected to be in the chain of
+the cell selected by FOLD; NAME is the next-pointer member of TYPE. */
+#define HASH_DELETE(TYPE, NAME, TABLE, FOLD, DATA)\
+do {\
+	hash_cell_t*	cell3333;\
+	TYPE*		struct3333;\
+\
+	cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)]; \
+\
+	if (cell3333->node == (DATA)) {\
+		HASH_ASSERT_VALID((DATA)->NAME);\
+		cell3333->node = (DATA)->NAME;\
+	} else {\
+		struct3333 = (TYPE*) cell3333->node;\
+\
+		while (struct3333->NAME != (DATA)) {\
+\
+			struct3333 = (TYPE*) struct3333->NAME;\
+			ut_a(struct3333);\
+		}\
+\
+		struct3333->NAME = (DATA)->NAME;\
+	}\
+	HASH_INVALIDATE((DATA), NAME);\
+} while (0)
+
+/*******************************************************************//**
+Gets the first struct in a hash chain, NULL if none. HASH_VAL is a cell
+index such as the result of calc_hash(), not a fold value. */
+#define HASH_GET_FIRST(TABLE, HASH_VAL) (TABLE)->array[HASH_VAL].node
+
+/*******************************************************************//**
+Gets the next struct in a hash chain, NULL if none. NAME is the name of
+the next-pointer member of the struct that DATA points to. */
+#define HASH_GET_NEXT(NAME, DATA)	((DATA)->NAME)
+
+/** Looks for a struct in the hash chain of the cell that FOLD maps to; ASSERTION
+runs for each struct visited. DATA ends as the first struct passing TEST, or NULL. */
+#define HASH_SEARCH(NAME, TABLE, FOLD, TYPE, DATA, ASSERTION, TEST)\
+{\
+	(DATA) = (TYPE) HASH_GET_FIRST(TABLE, (TABLE)->calc_hash(FOLD)); \
+	HASH_ASSERT_VALID(DATA);\
+\
+	while ((DATA) != NULL) {\
+		ASSERTION;\
+		if (TEST) {\
+			break;\
+		} else {\
+			HASH_ASSERT_VALID(HASH_GET_NEXT(NAME, DATA));\
+			(DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA);\
+		}\
+	}\
+}
+
+/** Looks for an item in all hash buckets, scanning cells from n_cells - 1 down to 0.
+DATA ends as the first struct passing TEST, or NULL (untouched if n_cells is 0). */
+#define HASH_SEARCH_ALL(NAME, TABLE, TYPE, DATA, ASSERTION, TEST)	\
+do {									\
+	ulint	i3333;							\
+									\
+	for (i3333 = (TABLE)->n_cells; i3333--; ) {			\
+		(DATA) = (TYPE) HASH_GET_FIRST(TABLE, i3333);		\
+									\
+		while ((DATA) != NULL) {				\
+			HASH_ASSERT_VALID(DATA);			\
+			ASSERTION;					\
+									\
+			if (TEST) {					\
+				break;					\
+			}						\
+									\
+			(DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA);	\
+		}							\
+									\
+		if ((DATA) != NULL) {					\
+			break;						\
+		}							\
+	}								\
+} while (0)
+
+/** Hash table with singly-linked overflow lists */
+struct hash_table_t
+{
+  /** number of elements in array (a prime number) */
+  ulint n_cells;
+  /** the hash array */
+  hash_cell_t *array;
+
+  /** Create the hash table with a zero-filled cell array.
+  @param n  the lower bound of n_cells */
+  void create(ulint n)
+  {
+    n_cells= ut_find_prime(n);
+    array= static_cast<hash_cell_t*>(ut_zalloc_nokey(n_cells * sizeof(hash_cell_t)));
+  }
+
+  /** Clear the hash table by zero-filling every cell. */
+  void clear() { memset(array, 0, sizeof(hash_cell_t) * n_cells); }
+
+  /** Free the cell array and reset the pointer to nullptr. */
+  void free() { ut_free(array); array= nullptr; }
+  /** @return the index into array that the fold value maps to */
+  ulint calc_hash(ulint fold) const { return ut_hash_ulint(fold, n_cells); }
+};
diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h
new file mode 100644
index 00000000..c246b2ef
--- /dev/null
+++ b/storage/innobase/include/ibuf0ibuf.h
@@ -0,0 +1,436 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0ibuf.h
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef ibuf0ibuf_h
+#define ibuf0ibuf_h
+
+#include "mtr0mtr.h"
+#include "dict0mem.h"
+#include "fsp0fsp.h"
+
+/** Default value for maximum on-disk size of change buffer in terms
+of percentage of the buffer pool. */
+#define CHANGE_BUFFER_DEFAULT_SIZE	(25)
+
+/** Operations that can be buffered in the change buffer (insert buffer). See
+ibuf_insert(). DO NOT CHANGE THE VALUES OF THESE, THEY ARE STORED ON DISK. */
+typedef enum {
+	IBUF_OP_INSERT = 0,
+	IBUF_OP_DELETE_MARK = 1,
+	IBUF_OP_DELETE = 2,
+
+	/* Number of different operation types (array size of the ibuf_t counters). */
+	IBUF_OP_COUNT = 3
+} ibuf_op_t;
+
+/** Combinations of operations that can be buffered. Below, "delete" means
+delete-marking; "purge" presumably means IBUF_OP_DELETE. @see innodb_change_buffering_names */
+enum ibuf_use_t {
+	IBUF_USE_NONE = 0,
+	IBUF_USE_INSERT,	/* insert */
+	IBUF_USE_DELETE_MARK,	/* delete */
+	IBUF_USE_INSERT_DELETE_MARK,	/* insert+delete */
+	IBUF_USE_DELETE,	/* delete+purge */
+	IBUF_USE_ALL		/* insert+delete+purge */
+};
+
+/** Operations that can currently be buffered. */
+extern ulong		innodb_change_buffering;
+
+/** Insert buffer (change buffer) control structure; see the global ibuf */
+struct ibuf_t{
+	Atomic_relaxed<ulint> size;	/*!< current size of the ibuf index
+					tree, in pages */
+	Atomic_relaxed<ulint> max_size;	/*!< recommended maximum size of the
+					ibuf index tree, in pages */
+	ulint		seg_size;	/*!< allocated pages of the file
+					segment containing ibuf header and
+					tree */
+	bool		empty;		/*!< Protected by the page
+					latch of the root page of the
+					insert buffer tree
+					(FSP_IBUF_TREE_ROOT_PAGE_NO). true
+					if and only if the insert
+					buffer tree is empty. */
+	ulint		free_list_len;	/*!< length of the free list */
+	ulint		height;		/*!< tree height */
+	dict_index_t*	index;		/*!< insert buffer index */
+
+	/** number of pages merged */
+	Atomic_counter<ulint> n_merges;
+	Atomic_counter<ulint> n_merged_ops[IBUF_OP_COUNT];
+					/*!< number of operations of each
+					ibuf_op_t type merged to index pages */
+	Atomic_counter<ulint> n_discarded_ops[IBUF_OP_COUNT];
+					/*!< number of operations of each
+					ibuf_op_t type discarded without merging
+					due to the tablespace being deleted or
+					the index being dropped */
+};
+
+/** The insert buffer control structure */
+extern ibuf_t		ibuf;
+
+/* The purpose of the insert buffer is to reduce random disk access.
+When we wish to insert a record into a non-unique secondary index and
+the B-tree leaf page where the record belongs to is not in the buffer
+pool, we insert the record into the insert buffer B-tree, indexed by
+(space_id, page_no).  When the page is eventually read into the buffer
+pool, we look up the insert buffer B-tree for any modifications to the
+page, and apply these upon the completion of the read operation.  This
+is called the insert buffer merge. */
+
+/* The insert buffer merge must always succeed.  To guarantee this,
+the insert buffer subsystem keeps track of the free space in pages for
+which it can buffer operations.  Two bits per page in the insert
+buffer bitmap indicate the available space in coarse increments.  The
+free bits in the insert buffer bitmap must never exceed the free space
+on a page.  It is safe to decrement or reset the bits in the bitmap in
+a mini-transaction that is committed before the mini-transaction that
+affects the free space.  It is unsafe to increment the bits in a
+separately committed mini-transaction, because in crash recovery, the
+free bits could momentarily be set too high. */
+
+/******************************************************************//**
+Creates the insert buffer data structure at database startup.
+@return DB_SUCCESS or failure */
+dberr_t
+ibuf_init_at_db_start(void);
+/*=======================*/
+/*********************************************************************//**
+Updates the max_size value for ibuf. */
+void
+ibuf_max_size_update(
+/*=================*/
+	ulint	new_val);	/*!< in: new value in terms of
+				percentage of the buffer pool size */
+/*********************************************************************//**
+Reads the biggest tablespace id from the high end of the insert buffer
+tree and updates the counter in fil_system. */
+void
+ibuf_update_max_tablespace_id(void);
+/*===============================*/
+/***************************************************************//**
+Starts an insert buffer mini-transaction (MTR_LOG_NO_REDO in read-only mode). */
+UNIV_INLINE
+void
+ibuf_mtr_start(
+/*===========*/
+	mtr_t*	mtr)	/*!< out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Commits an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_commit(
+/*============*/
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+/************************************************************************//**
+Resets the free bits of the page in the ibuf bitmap. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept.  NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page.  It is safe
+to decrement or reset the bits in the bitmap in a mini-transaction
+that is committed before the mini-transaction that affects the free
+space. */
+void
+ibuf_reset_free_bits(
+/*=================*/
+	buf_block_t*	block);	/*!< in: index page; free bits are set to 0
+				if the index is a non-clustered
+				non-unique, and page level is 0 */
+/************************************************************************//**
+Updates the free bits of an uncompressed page in the ibuf bitmap if
+there is not enough free space on the page any more.  This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept.  NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page.  It is
+unsafe to increment the bits in a separately committed
+mini-transaction, because in crash recovery, the free bits could
+momentarily be set too high.  It is only safe to use this function for
+decrementing the free bits.  Should more free space become available,
+we must not update the free bits here, because that would break crash
+recovery. */
+UNIV_INLINE
+void
+ibuf_update_free_bits_if_full(
+/*==========================*/
+	buf_block_t*	block,	/*!< in: index page to which we have added new
+				records; the free bits are updated if the
+				index is non-clustered and non-unique and
+				the page level is 0, and the page becomes
+				fuller */
+	ulint		max_ins_size,/*!< in: value of maximum insert size with
+				reorganize before the latest operation
+				performed to the page */
+	ulint		increase);/*!< in: upper limit for the additional space
+				used in the latest operation, if known, or
+				ULINT_UNDEFINED */
+/**********************************************************************//**
+Updates the free bits for an uncompressed page to reflect the present
+state.  Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed.  NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page.  It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+void
+ibuf_update_free_bits_low(
+/*======================*/
+	const buf_block_t*	block,		/*!< in: index page */
+	ulint			max_ins_size,	/*!< in: value of
+						maximum insert size
+						with reorganize before
+						the latest operation
+						performed to the page */
+	mtr_t*			mtr);		/*!< in/out: mtr */
+/**********************************************************************//**
+Updates the free bits for a compressed page to reflect the present
+state.  Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed.  NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page.  It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+void
+ibuf_update_free_bits_zip(
+/*======================*/
+	buf_block_t*	block,	/*!< in/out: index page */
+	mtr_t*		mtr);	/*!< in/out: mtr */
+/**********************************************************************//**
+Updates the free bits for the two pages to reflect the present state.
+Does this in the mtr given, which means that the latching order rules
+virtually prevent any further operations until mtr is committed.
+NOTE: The free bits in the insert buffer bitmap must never exceed the
+free space on a page.  It is safe to set the free bits in the same
+mini-transaction that updated the pages. */
+void
+ibuf_update_free_bits_for_two_pages_low(
+/*====================================*/
+	buf_block_t*	block1,	/*!< in: first index page */
+	buf_block_t*	block2,	/*!< in: second index page */
+	mtr_t*		mtr);	/*!< in/out: mtr */
+/**********************************************************************//**
+A basic partial test if an insert to the insert buffer could be possible and
+recommended. */
+UNIV_INLINE
+ibool
+ibuf_should_try(
+/*============*/
+	dict_index_t*	index,			/*!< in: index where to insert */
+	ulint		ignore_sec_unique);	/*!< in: if != 0, we should
+						ignore UNIQUE constraint on
+						a secondary index when we
+						decide */
+/******************************************************************//**
+Returns TRUE if the current OS thread is performing an insert buffer
+routine.
+
+For instance, a read-ahead of non-ibuf pages is forbidden by threads
+that are executing an insert buffer routine.
+@return TRUE if inside an insert buffer routine */
+UNIV_INLINE
+ibool
+ibuf_inside(
+/*========*/
+	const mtr_t*	mtr)	/*!< in: mini-transaction */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Checks if a page address is an ibuf bitmap page (level 3 page) address.
+@param[in]	page_id		page id
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@return true if a bitmap page */
+inline bool ibuf_bitmap_page(const page_id_t page_id, ulint zip_size)
+{
+	ut_ad(ut_is_2pow(zip_size));
+	const ulint mask = (zip_size ? zip_size : srv_page_size) - 1;
+	return FSP_IBUF_BITMAP_OFFSET == (page_id.page_no() & mask);
+}
+
+/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
+Must not be called when recv_no_ibuf_operations==true.
+@param[in]	page_id		page id
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	x_latch		(UNIV_DEBUG only) false if relaxed check (avoid
+latching the bitmap page)
+@param[in,out]	mtr		mtr which will contain an x-latch to the
+bitmap page if the page is not one of the fixed address ibuf pages, or NULL,
+in which case a new mini-transaction is created.
+@return true if level 2 or level 3 page */
+bool
+ibuf_page_low(
+	const page_id_t		page_id,
+	ulint			zip_size,
+#ifdef UNIV_DEBUG
+	bool			x_latch,
+#endif /* UNIV_DEBUG */
+	mtr_t*			mtr)
+	MY_ATTRIBUTE((warn_unused_result));
+
+#ifdef UNIV_DEBUG
+/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
+Must not be called when recv_no_ibuf_operations==true. Passes x_latch=true.
+@param[in]	page_id		tablespace/page identifier
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out]	mtr		mini-transaction or NULL
+@return true if level 2 or level 3 page */
+# define ibuf_page(page_id, zip_size, mtr)	\
+	ibuf_page_low(page_id, zip_size, true, mtr)
+
+#else /* UNIV_DEBUG */
+
+/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
+Must not be called when recv_no_ibuf_operations==true.
+@param[in]	page_id		tablespace/page identifier
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out]	mtr		mini-transaction or NULL
+@return true if level 2 or level 3 page */
+# define ibuf_page(page_id, zip_size, mtr)	\
+	ibuf_page_low(page_id, zip_size, mtr)
+
+#endif /* UNIV_DEBUG */
+/***********************************************************************//**
+Frees excess pages from the ibuf free list. This function is called when an OS
+thread calls fsp services to allocate a new file segment, or a new page to a
+file segment, and the thread did not own the fsp latch before this call. */
+void
+ibuf_free_excess_pages(void);
+/*========================*/
+
+/** Buffer an operation in the change buffer, instead of applying it
+directly to the file page, if this is possible. Does not do it if the index
+is clustered or unique.
+@param[in]	op		operation type
+@param[in]	entry		index entry to insert
+@param[in,out]	index		index where to insert
+@param[in]	page_id		page id where to insert
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out]	thr		query thread
+@return true if success */
+bool
+ibuf_insert(
+	ibuf_op_t		op,
+	const dtuple_t*		entry,
+	dict_index_t*		index,
+	const page_id_t		page_id,
+	ulint			zip_size,
+	que_thr_t*		thr);
+
+/** Check whether buffered changes exist for a page.
+@param[in]	id		page identifier
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@return whether buffered changes exist */
+bool ibuf_page_exists(const page_id_t id, ulint zip_size);
+
+/** When an index page is read from a disk to the buffer pool, this function
+applies any buffered operations to the page and deletes the entries from the
+insert buffer. If the page is not read, but created in the buffer pool, this
+function deletes its buffered entries from the insert buffer; there can
+exist entries for such a page if the page belonged to an index which
+subsequently was dropped.
+@param block    X-latched page to try to apply changes to, or NULL to discard
+@param page_id  page identifier
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return error code */
+dberr_t ibuf_merge_or_delete_for_page(buf_block_t *block,
+                                      const page_id_t page_id,
+                                      ulint zip_size);
+
+/** Delete all change buffer entries for a tablespace,
+in DISCARD TABLESPACE, IMPORT TABLESPACE, or read-ahead.
+@param[in]	space		missing or to-be-discarded tablespace */
+void ibuf_delete_for_discarded_space(uint32_t space);
+
+/** Contract the change buffer by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read
+@retval 0 if ibuf.empty */
+ulint ibuf_contract();
+
+/** Contracts insert buffer trees by reading pages referring to space_id
+to the buffer pool.
+@return number of pages merged */
+ulint
+ibuf_merge_space(
+/*=============*/
+	ulint	space);	/*!< in: space id */
+
+/******************************************************************//**
+Looks if the insert buffer is empty.
+@return true if empty */
+bool
+ibuf_is_empty(void);
+/*===============*/
+/******************************************************************//**
+Prints info of ibuf. */
+void
+ibuf_print(
+/*=======*/
+	FILE*	file);	/*!< in: file where to print */
+/******************************************************************//**
+Read the first two bytes from a record's fourth field (counter field in new
+records; something else in older records).
+@return "counter" field, or ULINT_UNDEFINED if for some reason it can't be read */
+ulint
+ibuf_rec_get_counter(
+/*=================*/
+	const rec_t*	rec);	/*!< in: ibuf record */
+/******************************************************************//**
+Closes insert buffer and frees the data structures. */
+void
+ibuf_close(void);
+/*============*/
+
+/** Check the insert buffer bitmaps on IMPORT TABLESPACE.
+@param[in]	trx	transaction
+@param[in,out]	space	tablespace being imported
+@return DB_SUCCESS or error code */
+dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Update free bits and buffered bits for bulk loaded page.
+@param block   secondary index leaf page
+@param mtr     mini-transaction
+@param reset   whether the page is full */
+void ibuf_set_bitmap_for_bulk_load(buf_block_t *block, mtr_t *mtr, bool reset);
+
+#define IBUF_HEADER_PAGE_NO	FSP_IBUF_HEADER_PAGE_NO
+#define IBUF_TREE_ROOT_PAGE_NO	FSP_IBUF_TREE_ROOT_PAGE_NO
+
+/* The ibuf header page currently contains only the file segment header
+for the file segment from which the pages for the ibuf tree are allocated */
+#define IBUF_HEADER		PAGE_DATA
+#define	IBUF_TREE_SEG_HEADER	0	/* fseg header for ibuf tree */
+
+/* The insert buffer tree itself is always located in space 0. */
+#define IBUF_SPACE_ID		static_cast<ulint>(0)
+
+#include "ibuf0ibuf.inl"
+
+#endif
diff --git a/storage/innobase/include/ibuf0ibuf.inl b/storage/innobase/include/ibuf0ibuf.inl
new file mode 100644
index 00000000..003bf22a
--- /dev/null
+++ b/storage/innobase/include/ibuf0ibuf.inl
@@ -0,0 +1,282 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0ibuf.inl
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "page0page.h"
+#include "page0zip.h"
+#include "fsp0types.h"
+#include "buf0lru.h"
+
+/** An index page must contain at least srv_page_size /
+IBUF_PAGE_SIZE_PER_FREE_SPACE bytes of free space for ibuf to try to
+buffer inserts to this page.  If there is this much of free space, the
+corresponding bits are set in the ibuf bitmap. */
+#define IBUF_PAGE_SIZE_PER_FREE_SPACE	32
+
+/** Start a mini-transaction and mark it as being inside the insert buffer.
+When high_level_read_only or srv_read_only_mode is set, the log mode of
+the mini-transaction is switched to MTR_LOG_NO_REDO.
+@param[out]	mtr	mini-transaction */
+UNIV_INLINE
+void
+ibuf_mtr_start(mtr_t* mtr)
+{
+	mtr_start(mtr);
+	mtr->enter_ibuf();
+
+	const bool	no_redo = high_level_read_only || srv_read_only_mode;
+	if (no_redo) {
+		mtr_set_log_mode(mtr, MTR_LOG_NO_REDO);
+	}
+}
+/** Commit an insert buffer mini-transaction.  The mini-transaction is
+expected to be flagged as inside the insert buffer: this is asserted with
+ut_ad(), and exit_ibuf() is invoked through ut_d() before the commit.
+@param[in,out]	mtr	mini-transaction */
+UNIV_INLINE
+void
+ibuf_mtr_commit(mtr_t* mtr)
+{
+	ut_ad(mtr->is_inside_ibuf());
+	ut_d(mtr->exit_ibuf());
+
+	mtr_commit(mtr);
+}
+
+/************************************************************************//**
+Sets the free bit of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+void
+ibuf_set_free_bits_func(
+/*====================*/
+	buf_block_t*	block,	/*!< in: index page of a non-clustered index;
+				free bit is reset if page level is 0 */
+#ifdef UNIV_IBUF_DEBUG
+	ulint		max_val,/*!< in: ULINT_UNDEFINED or a maximum
+				value which the bits must have before
+				setting; this is for debugging */
+#endif /* UNIV_IBUF_DEBUG */
+	ulint		val);	/*!< in: value to set: < 4 */
+#ifdef UNIV_IBUF_DEBUG
+# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,max,v)
+#else /* UNIV_IBUF_DEBUG */
+# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,v)
+#endif /* UNIV_IBUF_DEBUG */
+
+/** Quick, partial test of whether buffering a change to an index in the
+insert buffer could be possible and recommended.
+@param[in]	index			index where to insert
+@param[in]	ignore_sec_unique	if != 0, a UNIQUE constraint on a
+					secondary index is ignored when
+					deciding
+@return whether buffering should be tried */
+UNIV_INLINE
+ibool
+ibuf_should_try(dict_index_t* index, ulint ignore_sec_unique)
+{
+  const bool excluded_type=
+    (index->type & (DICT_CLUSTERED | DICT_IBUF | DICT_SPATIAL)) != 0;
+  if (excluded_type || !innodb_change_buffering || !ibuf.max_size)
+    return false;
+  if (ignore_sec_unique == 0 && index->is_unique())
+    return false;
+  if (index->table->quiesce != QUIESCE_NONE)
+    return false;
+  /* An index with any descending field is not buffered. */
+  for (unsigned field_no= 0; field_no < index->n_fields; field_no++)
+    if (index->fields[field_no].descending)
+      return false;
+  return true;
+}
+
+/** Tell whether a mini-transaction is flagged as executing an insert
+buffer routine.
+
+For instance, a read-ahead of non-ibuf pages is forbidden by threads
+that are executing an insert buffer routine.
+@param[in]	mtr	mini-transaction
+@return the result of mtr->is_inside_ibuf() */
+UNIV_INLINE
+ibool
+ibuf_inside(const mtr_t* mtr)
+{
+	const ibool	inside = mtr->is_inside_ibuf();
+
+	return inside;
+}
+
+/** Translate the free space on a page to a value in the ibuf bitmap.
+The space is counted in units of page_size / IBUF_PAGE_SIZE_PER_FREE_SPACE
+bytes: 0 to 2 units map to themselves, exactly 3 units map to 2, and
+more than 3 units map to 3.
+@param[in]	page_size	page size in bytes
+@param[in]	max_ins_size	maximum insert size after reorganize for
+the page
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_bits(
+	ulint	page_size,
+	ulint	max_ins_size)
+{
+	ut_ad(ut_is_2pow(page_size));
+	ut_ad(page_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
+
+	const ulint	unit = page_size / IBUF_PAGE_SIZE_PER_FREE_SPACE;
+	const ulint	n_units = max_ins_size / unit;
+
+	if (n_units > 3) {
+		return 3;
+	}
+
+	/* Exactly 3 units are reported as 2. */
+	return n_units == 3 ? 2 : n_units;
+}
+
+/** Translate the free space on a compressed page to a value in the ibuf
+bitmap.  The result is based on the smaller of the maximum insert size
+on the uncompressed page without reorganizing and the value returned by
+page_zip_max_ins_size(); 0 is returned if the latter is negative.
+@param[in]	block	buffer block; block->page.zip.data is asserted to
+be set
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_zip(
+	const buf_block_t*	block)
+{
+	ut_ad(block->page.zip.data);
+
+	/* Consider the maximum insert size on the uncompressed page
+	without reorganizing the page. We must not assume anything
+	about the compression ratio. If zip_max_ins > max_ins_size and
+	there is 1/4 garbage on the page, recompression after the
+	reorganize could fail, in theory. So, let us guarantee that
+	merging a buffered insert to a compressed page will always
+	succeed without reorganizing or recompressing the page, just
+	by using the page modification log. */
+	const ulint	page_max_ins = page_get_max_insert_size(
+		buf_block_get_frame(block), 1);
+
+	const lint	zip_max_ins = page_zip_max_ins_size(
+		buf_block_get_page_zip(block), FALSE/* not clustered */);
+
+	if (zip_max_ins < 0) {
+		return 0;
+	}
+
+	/* Cap the uncompressed limit by the compressed one. */
+	const ulint	max_ins_size = page_max_ins > (ulint) zip_max_ins
+		? (ulint) zip_max_ins : page_max_ins;
+
+	return ibuf_index_page_calc_free_bits(block->physical_size(),
+					      max_ins_size);
+}
+
+/** Translate the free space on a page to a value in the ibuf bitmap.
+A block with a compressed page is handed to ibuf_index_page_calc_free_zip().
+@param[in]	block	buffer block
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free(const buf_block_t* block)
+{
+	if (block->page.zip.data) {
+		return ibuf_index_page_calc_free_zip(block);
+	}
+
+	/* No compressed page: use the maximum insert size reported by
+	page_get_max_insert_size_after_reorganize(). */
+	const ulint	max_ins_size
+		= page_get_max_insert_size_after_reorganize(
+			buf_block_get_frame(block), 1);
+
+	return ibuf_index_page_calc_free_bits(block->physical_size(),
+					      max_ins_size);
+}
+
+/** Lower the free bits of an uncompressed page in the ibuf bitmap if the
+page no longer has enough free space.  This is done in a separate
+mini-transaction, hence this operation does not restrict further work to
+only ibuf bitmap operations, which would result if the latch to the
+bitmap page were kept.  NOTE: The free bits in the insert buffer bitmap
+must never exceed the free space on a page.  It is unsafe to increment
+the bits in a separately committed mini-transaction, because in crash
+recovery, the free bits could momentarily be set too high.  It is only
+safe to use this function for decrementing the free bits.  Should more
+free space become available, we must not update the free bits here,
+because that would break crash recovery.
+@param[in]	block		index page to which we have added new records;
+				the free bits are updated if the index is
+				non-clustered and non-unique and the page
+				level is 0, and the page becomes fuller
+@param[in]	max_ins_size	value of maximum insert size with reorganize
+				before the latest operation performed to the
+				page
+@param[in]	increase	upper limit for the additional space used in
+				the latest operation, if known, or
+				ULINT_UNDEFINED */
+UNIV_INLINE
+void
+ibuf_update_free_bits_if_full(
+	buf_block_t*	block,
+	ulint		max_ins_size,
+	ulint		increase)
+{
+	ut_ad(buf_block_get_page_zip(block) == NULL);
+
+	const ulint	before = ibuf_index_page_calc_free_bits(
+		srv_page_size, max_ins_size);
+	ulint		after;
+
+	if (increase > max_ins_size) {
+		/* The upper limit exceeds the previous maximum insert
+		size: recompute the bits from the page itself. */
+		after = ibuf_index_page_calc_free(block);
+	} else {
+		compile_time_assert(ULINT32_UNDEFINED > UNIV_PAGE_SIZE_MAX);
+		after = ibuf_index_page_calc_free_bits(
+			srv_page_size, max_ins_size - increase);
+#ifdef UNIV_IBUF_DEBUG
+		ut_a(after <= ibuf_index_page_calc_free(block));
+#endif
+	}
+
+	if (after == 0) {
+		/* We move the page to the front of the buffer pool LRU list:
+		the purpose of this is to prevent those pages to which we
+		cannot make inserts using the insert buffer from slipping
+		out of the buffer pool */
+
+		buf_page_make_young(&block->page);
+	}
+
+	/* The bits may only be lowered here, never raised. */
+	if (after < before) {
+		ibuf_set_free_bits(block, after, before);
+	}
+}
diff --git a/storage/innobase/include/lock0iter.h b/storage/innobase/include/lock0iter.h
new file mode 100644
index 00000000..a7e61395
--- /dev/null
+++ b/storage/innobase/include/lock0iter.h
@@ -0,0 +1,66 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0iter.h
+Lock queue iterator type and function prototypes.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef lock0iter_h
+#define lock0iter_h
+
+#include "lock0types.h"
+
+struct lock_queue_iterator_t {
+	const lock_t*	current_lock;	/*!< lock the iterator is positioned at */
+	/** In case this is a record lock queue (not table lock queue)
+	then bit_no is the record number within the heap in which the
+	record is stored. */
+	ulint		bit_no;
+};
+
+/*******************************************************************//**
+Initialize lock queue iterator so that it starts to iterate from
+"lock". bit_no specifies the record number within the heap where the
+record is stored. It can be undefined (ULINT_UNDEFINED) in two cases:
+1. If the lock is a table lock, thus we have a table lock queue;
+2. If the lock is a record lock and it is a wait lock. In this case
+   bit_no is calculated in this function by using
+   lock_rec_find_set_bit(). There is exactly one bit set in the bitmap
+   of a wait lock. */
+void
+lock_queue_iterator_reset(
+/*======================*/
+	lock_queue_iterator_t*	iter,	/*!< out: iterator */
+	const lock_t*		lock,	/*!< in: lock to start from */
+	ulint			bit_no);/*!< in: record number in the
+					heap */
+
+/*******************************************************************//**
+Gets the previous lock in the lock queue, returns NULL if there are no
+more locks (i.e. the current lock is the first one). The iterator is
+receded (if not-NULL is returned).
+@return previous lock or NULL */
+const lock_t*
+lock_queue_iterator_get_prev(
+/*=========================*/
+	lock_queue_iterator_t*	iter);	/*!< in/out: iterator */
+
+#endif /* lock0iter_h */
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
new file mode 100644
index 00000000..59ee7f55
--- /dev/null
+++ b/storage/innobase/include/lock0lock.h
@@ -0,0 +1,1271 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2022, Oracle and/or its affiliates.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0lock.h
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef lock0lock_h
+#define lock0lock_h
+
+#include "buf0types.h"
+#include "trx0trx.h"
+#include "mtr0types.h"
+#include "rem0types.h"
+#include "hash0hash.h"
+#include "srv0srv.h"
+#include "ut0vec.h"
+#include "gis0rtree.h"
+#include "lock0prdt.h"
+#include "transactional_lock_guard.h"
+
+// Forward declaration
+class ReadView;
+
+/** The value of innodb_deadlock_detect */
+extern my_bool innodb_deadlock_detect;
+/** The value of innodb_deadlock_report */
+extern ulong innodb_deadlock_report;
+
+namespace Deadlock
+{
+  /** The allowed values of innodb_deadlock_report */
+  enum report { REPORT_OFF, REPORT_BASIC, REPORT_FULL };
+} // namespace Deadlock
+
+/*********************************************************************//**
+Gets the heap_no of the smallest user record on a page.
+@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
+UNIV_INLINE
+ulint
+lock_get_min_heap_no(
+/*=================*/
+	const buf_block_t*	block);	/*!< in: buffer block */
+
+/** Discard locks for an index when purging DELETE FROM SYS_INDEXES
+after an aborted CREATE INDEX operation.
+@param index   a stale index on which ADD INDEX operation was aborted */
+ATTRIBUTE_COLD void lock_discard_for_index(const dict_index_t &index);
+
+/*************************************************************//**
+Updates the lock table when we have reorganized a page. NOTE: we copy
+also the locks set on the infimum of the page; the infimum may carry
+locks if an update of a record is occurring on the page, and its locks
+were temporarily stored on the infimum. */
+void
+lock_move_reorganize_page(
+/*======================*/
+	const buf_block_t*	block,	/*!< in: old index page, now
+					reorganized */
+	const buf_block_t*	oblock);/*!< in: copy of the old, not
+					reorganized page */
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list end is moved to another page. */
+void
+lock_move_rec_list_end(
+/*===================*/
+	const buf_block_t*	new_block,	/*!< in: index page to move to */
+	const buf_block_t*	block,		/*!< in: index page */
+	const rec_t*		rec);		/*!< in: record on page: this
+						is the first record moved */
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list start is moved to another page. */
+void
+lock_move_rec_list_start(
+/*=====================*/
+	const buf_block_t*	new_block,	/*!< in: index page to move to */
+	const buf_block_t*	block,		/*!< in: index page */
+	const rec_t*		rec,		/*!< in: record on page:
+						this is the first
+						record NOT copied */
+	const rec_t*		old_end);	/*!< in: old
+						previous-to-last
+						record on new_page
+						before the records
+						were copied */
+/*************************************************************//**
+Updates the lock table when a page is split to the right. */
+void
+lock_update_split_right(
+/*====================*/
+	const buf_block_t*	right_block,	/*!< in: right page */
+	const buf_block_t*	left_block);	/*!< in: left page */
+/*************************************************************//**
+Updates the lock table when a page is merged to the right. */
+void
+lock_update_merge_right(
+/*====================*/
+	const buf_block_t*	right_block,	/*!< in: right page to
+						which merged */
+	const rec_t*		orig_succ,	/*!< in: original
+						successor of infimum
+						on the right page
+						before merge */
+	const buf_block_t*	left_block);	/*!< in: merged index
+						page which will be
+						discarded */
+/** Update locks when the root page is copied to another in
+btr_root_raise_and_insert(). Note that we leave lock structs on the
+root page, even though they do not make sense on other than leaf
+pages: the reason is that in a pessimistic update the infimum record
+of the root page will act as a dummy carrier of the locks of the record
+to be updated. */
+void lock_update_root_raise(const buf_block_t &block, const page_id_t root);
+/** Update the lock table when a page is copied to another.
+@param new_block  the target page
+@param old        old page (not index root page) */
+void lock_update_copy_and_discard(const buf_block_t &new_block, page_id_t old);
+
+/** Update gap locks between the last record of the left_block and the
+first record of the right_block when a record is about to be inserted
+at the start of the right_block, even though it should "naturally" be
+inserted as the last record of the left_block according to the
+current node pointer in the parent page.
+
+That is, we assume that the lowest common ancestor of the left_block
+and right_block routes the key of the new record to the left_block,
+but a heuristic which tries to avoid overflowing left_block has chosen
+to insert the record into right_block instead. Said ancestor performs
+this routing by comparing the key of the record to a "split point" -
+all records greater than or equal to the split point (node pointer)
+are in right_block, and smaller ones in left_block.
+The split point may be smaller than the smallest key in right_block.
+
+The gap between the last record on the left_block and the first record
+on the right_block is represented as a gap lock attached to the supremum
+pseudo-record of left_block, and a gap lock attached to the new first
+record of right_block.
+
+Thus, inserting the new record, and subsequently adjusting the node
+pointers in parent pages to values smaller than or equal to the new
+record's key, will mean that the gap will be sliced at a different place
+("moved to the left"): fragment of the 1st gap will now become treated
+as 2nd. Therefore, we must copy any GRANTED locks from 1st gap to the
+2nd gap. Any WAITING locks must be of INSERT_INTENTION type (as no
+other GAP locks ever wait for anything) and can stay at 1st gap, as
+their only purpose is to notify the requester they can retry
+insertion, and there's no correctness requirement to avoid waking them
+up too soon.
+@param left_block   left page
+@param right_block  right page */
+void lock_update_node_pointer(const buf_block_t *left_block,
+                              const buf_block_t *right_block);
+/*************************************************************//**
+Updates the lock table when a page is split to the left. */
+void
+lock_update_split_left(
+/*===================*/
+	const buf_block_t*	right_block,	/*!< in: right page */
+	const buf_block_t*	left_block);	/*!< in: left page */
+/** Update the lock table when a page is merged to the left.
+@param left      left page
+@param orig_pred original predecessor of supremum on the left page before merge
+@param right     merged, to-be-discarded right page */
+void lock_update_merge_left(const buf_block_t& left, const rec_t *orig_pred,
+                            const page_id_t right);
+
+/** Update the locks when a page is split and merged to two pages,
+in defragmentation. */
+void lock_update_split_and_merge(
+	const buf_block_t* left_block,	/*!< in: left page to which merged */
+	const rec_t* orig_pred,		/*!< in: original predecessor of
+					supremum on the left page before merge*/
+	const buf_block_t* right_block);/*!< in: right page from which merged */
+/*************************************************************//**
+Resets the original locks on heir and replaces them with gap type locks
+inherited from rec. */
+void
+lock_rec_reset_and_inherit_gap_locks(
+/*=================================*/
+	const buf_block_t&	heir_block,	/*!< in: block containing the
+						record which inherits */
+	const page_id_t		donor,		/*!< in: page containing the
+						record from which inherited;
+						does NOT reset the locks on
+						this record */
+	ulint			heir_heap_no,	/*!< in: heap_no of the
+						inheriting record */
+	ulint			heap_no);	/*!< in: heap_no of the
+						donating record */
+/*************************************************************//**
+Updates the lock table when a page is discarded. */
+void
+lock_update_discard(
+/*================*/
+	const buf_block_t*	heir_block,	/*!< in: index page
+						which will inherit the locks */
+	ulint			heir_heap_no,	/*!< in: heap_no of the record
+						which will inherit the locks */
+	const buf_block_t*	block);		/*!< in: index page
+						which will be discarded */
+/*************************************************************//**
+Updates the lock table when a new user record is inserted. */
+void
+lock_update_insert(
+/*===============*/
+	const buf_block_t*	block,	/*!< in: buffer block containing rec */
+	const rec_t*		rec);	/*!< in: the inserted record */
+/*************************************************************//**
+Updates the lock table when a record is removed. */
+void
+lock_update_delete(
+/*===============*/
+	const buf_block_t*	block,	/*!< in: buffer block containing rec */
+	const rec_t*		rec);	/*!< in: the record to be removed */
+/*********************************************************************//**
+Stores on the page infimum record the explicit locks of another record.
+This function is used to store the lock state of a record when it is
+updated and the size of the record changes in the update. The record
+is in such an update moved, perhaps to another page. The infimum record
+acts as a dummy carrier record, taking care of lock releases while the
+actual record is being moved. */
+void
+lock_rec_store_on_page_infimum(
+/*===========================*/
+	const buf_block_t*	block,	/*!< in: buffer block containing rec */
+	const rec_t*		rec);	/*!< in: record whose lock state
+					is stored on the infimum
+					record of the same page; lock
+					bits are reset on the
+					record */
+/** Restore the explicit lock requests on a single record, where the
+state was stored on the infimum of a page.
+@param block   buffer block containing rec
+@param rec     record whose lock state is restored
+@param donator page (rec is not necessarily on this page)
+whose infimum stored the lock state; lock bits are reset on the infimum */
+void lock_rec_restore_from_page_infimum(const buf_block_t &block,
+					const rec_t *rec, page_id_t donator);
+
+/**
+Create a table lock, without checking for deadlocks or lock compatibility.
+@param table      table on which the lock is created
+@param type_mode  lock type and mode
+@param trx        transaction
+@param c_lock     conflicting lock
+@return the created lock object */
+lock_t *lock_table_create(dict_table_t *table, unsigned type_mode, trx_t *trx,
+                          lock_t *c_lock= nullptr);
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate insert of
+a record. If they do, first tests if the query thread should anyway
+be suspended for some reason; if not, then puts the transaction and
+the query thread to the lock wait state and inserts a waiting request
+for a gap x-lock to the lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_rec_insert_check_and_lock(
+/*===========================*/
+	const rec_t*	rec,	/*!< in: record after which to insert */
+	buf_block_t*	block,	/*!< in/out: buffer block of rec */
+	dict_index_t*	index,	/*!< in: index */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	bool*		inherit)/*!< out: set to true if the new
+				inserted record maybe should inherit
+				LOCK_GAP type locks from the successor
+				record */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify (update,
+delete mark, or delete unmark) of a clustered index record. If they do,
+first tests if the query thread should anyway be suspended for some
+reason; if not, then puts the transaction and the query thread to the
+lock wait state and inserts a waiting request for a record x-lock to the
+lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_clust_rec_modify_check_and_lock(
+/*=================================*/
+	const buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*		rec,	/*!< in: record which should be
+					modified */
+	dict_index_t*		index,	/*!< in: clustered index */
+	const rec_offs*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	que_thr_t*		thr)	/*!< in: query thread */
+	MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify
+(delete mark or delete unmark) of a secondary index record.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_sec_rec_modify_check_and_lock(
+/*===============================*/
+	ulint		flags,	/*!< in: if BTR_NO_LOCKING_FLAG
+				bit is set, does nothing */
+	buf_block_t*	block,	/*!< in/out: buffer block of rec */
+	const rec_t*	rec,	/*!< in: record which should be
+				modified; NOTE: as this is a secondary
+				index, we always have to modify the
+				clustered index record first: see the
+				comment below */
+	dict_index_t*	index,	/*!< in: secondary index */
+	que_thr_t*	thr,	/*!< in: query thread
+				(can be NULL if BTR_NO_LOCKING_FLAG) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Like lock_clust_rec_read_check_and_lock(), but reads a
+secondary index record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_sec_rec_read_check_and_lock(
+/*=============================*/
+	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
+					bit is set, does nothing */
+	const buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*		rec,	/*!< in: user record or page
+					supremum record which should
+					be read or passed over by a
+					read cursor */
+	dict_index_t*		index,	/*!< in: secondary index */
+	const rec_offs*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	lock_mode		mode,	/*!< in: mode of the lock which
+					the read cursor should set on
+					records: LOCK_S or LOCK_X; the
+					latter is possible in
+					SELECT FOR UPDATE */
+	unsigned		gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+					LOCK_REC_NOT_GAP */
+	que_thr_t*		thr);	/*!< in: query thread */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_clust_rec_read_check_and_lock(
+/*===============================*/
+	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
+					bit is set, does nothing */
+	const buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*		rec,	/*!< in: user record or page
+					supremum record which should
+					be read or passed over by a
+					read cursor */
+	dict_index_t*		index,	/*!< in: clustered index */
+	const rec_offs*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	lock_mode		mode,	/*!< in: mode of the lock which
+					the read cursor should set on
+					records: LOCK_S or LOCK_X; the
+					latter is possible in
+					SELECT FOR UPDATE */
+	unsigned		gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+					LOCK_REC_NOT_GAP */
+	que_thr_t*		thr);	/*!< in: query thread */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record. This is an alternative version of
+lock_clust_rec_read_check_and_lock() that does not require the parameter
+"offsets".
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_clust_rec_read_check_and_lock_alt(
+/*===================================*/
+	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
+					bit is set, does nothing */
+	const buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*		rec,	/*!< in: user record or page
+					supremum record which should
+					be read or passed over by a
+					read cursor */
+	dict_index_t*		index,	/*!< in: clustered index */
+	lock_mode		mode,	/*!< in: mode of the lock which
+					the read cursor should set on
+					records: LOCK_S or LOCK_X; the
+					latter is possible in
+					SELECT FOR UPDATE */
+	unsigned		gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+					LOCK_REC_NOT_GAP */
+	que_thr_t*		thr)	/*!< in: query thread */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Acquire a table lock.
+@param table   table to be locked
+@param fktable pointer to table, in case of a FOREIGN key check
+@param mode    lock mode
+@param thr     SQL execution thread
+@retval DB_SUCCESS    if the lock was acquired
+@retval DB_DEADLOCK   if a deadlock occurred, or fktable && *fktable != table
+@retval DB_LOCK_WAIT  if lock_wait() must be invoked */
+dberr_t lock_table(dict_table_t *table, dict_table_t *const*fktable,
+                   lock_mode mode, que_thr_t *thr)
+  MY_ATTRIBUTE((warn_unused_result));
+
+/** Create a table lock object for a resurrected transaction.
+@param table    table to be X-locked
+@param trx      transaction
+@param mode     LOCK_X or LOCK_IX */
+void lock_table_resurrect(dict_table_t *table, trx_t *trx, lock_mode mode);
+
+/** Sets a lock on a table based on the given mode.
+@param table	table to lock
+@param trx	transaction
+@param mode	LOCK_X or LOCK_S
+@param no_wait  whether to skip handling DB_LOCK_WAIT
+@return error code */
+dberr_t lock_table_for_trx(dict_table_t *table, trx_t *trx, lock_mode mode,
+                           bool no_wait= false)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Exclusively lock the data dictionary tables.
+@param trx  dictionary transaction
+@return error code
+@retval DB_SUCCESS on success */
+dberr_t lock_sys_tables(trx_t *trx);
+
+/*************************************************************//**
+Removes a granted record lock of a transaction from the queue and grants
+locks to other transactions waiting in the queue if they now are entitled
+to a lock. */
+void
+lock_rec_unlock(
+/*============*/
+	trx_t*			trx,	/*!< in/out: transaction that has
+					set a record lock */
+	const page_id_t		id,	/*!< in: page containing rec */
+	const rec_t*		rec,	/*!< in: record */
+	lock_mode		lock_mode);/*!< in: LOCK_S or LOCK_X */
+
+/** Release the explicit locks of a committing transaction,
+and release possible other transactions waiting because of these locks. */
+void lock_release(trx_t* trx);
+
+/** Release the explicit locks of a committing transaction while
+dict_sys.latch is exclusively locked,
+and release possible other transactions waiting because of these locks. */
+void lock_release_on_drop(trx_t *trx);
+
+/** Release non-exclusive locks on XA PREPARE,
+and release possible other transactions waiting because of these locks. */
+void lock_release_on_prepare(trx_t *trx);
+
+/** Release locks on a table whose creation is being rolled back */
+ATTRIBUTE_COLD void lock_release_on_rollback(trx_t *trx, dict_table_t *table);
+
+/**********************************************************************//**
+Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED,
+if none found.
+@return bit index == heap number of the record, or ULINT_UNDEFINED if
+none found */
+ulint
+lock_rec_find_set_bit(
+/*==================*/
+	const lock_t*	lock);	/*!< in: record lock with at least one
+				bit set */
+
+/*********************************************************************//**
+Checks if a lock request lock1 has to wait for request lock2.
+@return whether lock1 has to wait for lock2 to be removed */
+bool
+lock_has_to_wait(
+/*=============*/
+	const lock_t*	lock1,	/*!< in: waiting lock */
+	const lock_t*	lock2);	/*!< in: another lock; NOTE that it is
+				assumed that this has a lock bit set
+				on the same record as in lock1 if the
+				locks are record locks */
+/*********************************************************************//**
+Reports that a transaction id is insensible, i.e., in the future. */
+ATTRIBUTE_COLD
+void
+lock_report_trx_id_insanity(
+/*========================*/
+	trx_id_t	trx_id,		/*!< in: trx id */
+	const rec_t*	rec,		/*!< in: user record */
+	dict_index_t*	index,		/*!< in: index */
+	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec, index) */
+	trx_id_t	max_trx_id);	/*!< in: trx_sys.get_max_trx_id() */
+/*********************************************************************//**
+Prints info of locks for all transactions.
+@return FALSE if not able to acquire lock_sys.latch (and display info) */
+ibool
+lock_print_info_summary(
+/*====================*/
+	FILE*	file,	/*!< in: file where to print */
+	ibool   nowait)	/*!< in: whether to wait for lock_sys.latch */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Prints transaction lock wait and MVCC state.
+@param[in,out]	file	file where to print
+@param[in]	trx	transaction
+@param[in]	now	current my_hrtime_coarse() */
+void lock_trx_print_wait_and_mvcc_state(FILE *file, const trx_t *trx,
+                                        my_hrtime_t now);
+
+/*********************************************************************//**
+Prints info of locks for each transaction. This function will release
+lock_sys.latch, which the caller must be holding in exclusive mode. */
+void
+lock_print_info_all_transactions(
+/*=============================*/
+	FILE*	file);	/*!< in: file where to print */
+
+/*********************************************************************//**
+Return the number of table locks for a transaction.
+The caller must be holding lock_sys.latch. */
+ulint
+lock_number_of_tables_locked(
+/*=========================*/
+	const trx_lock_t*	trx_lock)	/*!< in: transaction locks */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Check if there are any locks on a table.
+@return true if table has either table or record locks. */
+bool lock_table_has_locks(dict_table_t *table);
+
+/** Wait for a lock to be released.
+@retval DB_DEADLOCK if this transaction was chosen as the deadlock victim
+@retval DB_INTERRUPTED if the execution was interrupted by the user
+@retval DB_LOCK_WAIT_TIMEOUT if the lock wait timed out
+@retval DB_SUCCESS if the lock was granted */
+dberr_t lock_wait(que_thr_t *thr);
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd). */
+void
+lock_unlock_table_autoinc(
+/*======================*/
+	trx_t*	trx);			/*!< in/out: transaction */
+
+/** Handle a pending lock wait (DB_LOCK_WAIT) in a semi-consistent read
+while holding a clustered index leaf page latch.
+@param trx           transaction that is or was waiting for a lock
+@retval DB_SUCCESS   if the lock was granted
+@retval DB_DEADLOCK  if the transaction must be aborted due to a deadlock
+@retval DB_LOCK_WAIT if a lock wait would be necessary; the pending
+                     lock request was released */
+dberr_t lock_trx_handle_wait(trx_t *trx);
+
+/*********************************************************************//**
+Checks that a transaction id is sensible, i.e., not in the future.
+@return true if ok */
+bool
+lock_check_trx_id_sanity(
+/*=====================*/
+	trx_id_t	trx_id,		/*!< in: trx id */
+	const rec_t*	rec,		/*!< in: user record */
+	dict_index_t*	index,		/*!< in: index */
+	const rec_offs*	offsets);	/*!< in: rec_get_offsets(rec, index) */
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Check if the transaction holds any locks on the sys tables
+or its records.
+@return the strongest lock found on any sys table or 0 for none */
+const lock_t*
+lock_trx_has_sys_table_locks(
+/*=========================*/
+	const trx_t*	trx)	/*!< in: transaction to check */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Check if the transaction holds an explicit exclusive lock on a record.
+@param[in]	trx	transaction
+@param[in]	table	table
+@param[in]	id	leaf page identifier
+@param[in]	heap_no	heap number identifying the record
+@return whether an explicit X-lock is held */
+bool lock_trx_has_expl_x_lock(const trx_t &trx, const dict_table_t &table,
+                              page_id_t id, ulint heap_no);
+#endif /* UNIV_DEBUG */
+
+/** Lock operation struct */
+struct lock_op_t{
+	dict_table_t*	table;	/*!< table to be locked */
+	lock_mode	mode;	/*!< lock mode */
+};
+
+/** The lock system struct */
+class lock_sys_t
+{
+  friend struct LockGuard;
+  friend struct LockMultiGuard;
+  friend struct TMLockGuard;
+  friend struct TMLockMutexGuard;
+  friend struct TMLockTrxGuard;
+
+  /** Hash table latch */
+  struct hash_latch
+#ifdef SUX_LOCK_GENERIC
+  : private rw_lock
+  {
+    /** Wait for an exclusive lock */
+    void wait();
+    /** Try to acquire a lock */
+    bool try_acquire() { return write_trylock(); }
+    /** Acquire a lock */
+    void acquire() { if (!try_acquire()) wait(); }
+    /** Release a lock */
+    void release();
+    /** @return whether any lock is being held or waited for by any thread */
+    bool is_locked_or_waiting() const
+    { return rw_lock::is_locked_or_waiting(); }
+    /** @return whether this latch is possibly held by any thread */
+    bool is_locked() const { return rw_lock::is_locked(); }
+#else
+  {
+  private:
+    srw_spin_lock_low lock;
+  public:
+    /** Try to acquire a lock */
+    bool try_acquire() { return lock.wr_lock_try(); }
+    /** Acquire a lock */
+    void acquire() { lock.wr_lock(); }
+    /** Release a lock */
+    void release() { lock.wr_unlock(); }
+    /** @return whether any lock may be held by any thread */
+    bool is_locked_or_waiting() const noexcept
+    { return lock.is_locked_or_waiting(); }
+    /** @return whether this latch is possibly held by any thread */
+    bool is_locked() const noexcept { return lock.is_locked(); }
+#endif
+  };
+
+public:
+  struct hash_table
+  {
+    /** Number of consecutive array[] elements occupied by a hash_latch */
+    static constexpr size_t LATCH= sizeof(void*) >= sizeof(hash_latch) ? 1 : 2;
+    static_assert(sizeof(hash_latch) <= LATCH * sizeof(void*), "allocation");
+
+    /** Number of array[] elements per hash_latch.
+    Must be LATCH less than a power of 2. */
+    static constexpr size_t ELEMENTS_PER_LATCH= (64 / sizeof(void*)) - LATCH;
+    static constexpr size_t EMPTY_SLOTS_PER_LATCH=
+      ((CPU_LEVEL1_DCACHE_LINESIZE / 64) - 1) * (64 / sizeof(void*));
+
+    /** number of payload elements in array[]. Protected by lock_sys.latch. */
+    ulint n_cells;
+    /** the hash table, with pad(n_cells) elements, aligned to L1 cache size;
+    in any hash chain, lock_t::is_waiting() entries must not precede
+    granted locks */
+    hash_cell_t *array;
+
+    /** Create the hash table.
+    @param n  the lower bound of n_cells */
+    void create(ulint n);
+
+    /** Resize the hash table.
+    @param n  the lower bound of n_cells */
+    void resize(ulint n);
+
+    /** Free the hash table. */
+    void free() { aligned_free(array); array= nullptr; }
+
+    /** @return the index of an array element */
+    inline ulint calc_hash(ulint fold) const;
+
+    /** Convert a raw hash value to an index into the padded array[].
+    @return h plus the latch and empty slots that precede element h */
+    static ulint pad(ulint h)
+    {
+      const ulint groups= h / ELEMENTS_PER_LATCH;
+      return h + LATCH + groups * (LATCH + EMPTY_SLOTS_PER_LATCH);
+    }
+
+    /** Get a latch. */
+    static hash_latch *latch(hash_cell_t *cell)
+    {
+      void *l= ut_align_down(cell, sizeof *cell *
+                             (ELEMENTS_PER_LATCH + LATCH));
+      return static_cast<hash_latch*>(l);
+    }
+    /** Get a hash table cell. */
+    inline hash_cell_t *cell_get(ulint fold) const;
+
+#ifdef UNIV_DEBUG
+    void assert_locked(const page_id_t id) const;
+#else
+    void assert_locked(const page_id_t) const {}
+#endif
+
+  private:
+    /** @return the hash value before any ELEMENTS_PER_LATCH padding */
+    static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); }
+
+    /** @return the index of an array element */
+    static ulint calc_hash(ulint fold, ulint n_cells)
+    {
+      return pad(hash(fold, n_cells));
+    }
+  };
+
+private:
+  bool m_initialised;
+
+  /** mutex protecting the locks */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock latch;
+#ifdef UNIV_DEBUG
+  /** The owner of exclusive latch (0 if none); protected by latch */
+  std::atomic<pthread_t> writer{0};
+  /** Number of shared latches */
+  std::atomic<ulint> readers{0};
+#endif
+#ifdef SUX_LOCK_GENERIC
+protected:
+  /** mutex for hash_latch::wait() */
+  pthread_mutex_t hash_mutex;
+  /** condition variable for hash_latch::wait() */
+  pthread_cond_t hash_cond;
+#endif
+public:
+  /** record locks */
+  hash_table rec_hash;
+  /** predicate locks for SPATIAL INDEX */
+  hash_table prdt_hash;
+  /** page locks for SPATIAL INDEX */
+  hash_table prdt_page_hash;
+
+  /** mutex covering lock waits; @see trx_lock_t::wait_lock */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t wait_mutex;
+private:
+  /** The increment of wait_count for a wait. Anything smaller is a
+  pending wait count. */
+  static constexpr uint64_t WAIT_COUNT_STEP= 1U << 19;
+  /** pending and cumulative number of lock waits; protected by wait_mutex */
+  uint64_t wait_count;
+  /** Cumulative wait time; protected by wait_mutex */
+  uint64_t wait_time;
+  /** Longest wait time; protected by wait_mutex */
+  uint64_t wait_time_max;
+public:
+  /** number of deadlocks detected; protected by wait_mutex */
+  ulint deadlocks;
+  /** number of lock wait timeouts; protected by wait_mutex */
+  ulint timeouts;
+  /**
+    Constructor.
+
+    Some members may require late initialisation, thus we just mark object as
+    uninitialised. Real initialisation happens in create().
+  */
+  lock_sys_t(): m_initialised(false) {}
+
+
+  bool is_initialised() const { return m_initialised; }
+
+#ifdef UNIV_PFS_RWLOCK
+  /** Acquire exclusive lock_sys.latch */
+  ATTRIBUTE_NOINLINE
+  void wr_lock(const char *file, unsigned line);
+  /** Release exclusive lock_sys.latch */
+  ATTRIBUTE_NOINLINE void wr_unlock();
+  /** Acquire shared lock_sys.latch */
+  ATTRIBUTE_NOINLINE void rd_lock(const char *file, unsigned line);
+  /** Release shared lock_sys.latch */
+  ATTRIBUTE_NOINLINE void rd_unlock();
+#else
+  /** Acquire exclusive lock_sys.latch */
+  void wr_lock()
+  {
+    mysql_mutex_assert_not_owner(&wait_mutex);
+    ut_ad(!is_writer());
+    latch.wr_lock();
+    ut_ad(!writer.exchange(pthread_self(),
+                           std::memory_order_relaxed));
+  }
+  /** Release exclusive lock_sys.latch */
+  void wr_unlock()
+  {
+    ut_ad(writer.exchange(0, std::memory_order_relaxed) ==
+          pthread_self());
+    latch.wr_unlock();
+  }
+  /** Acquire shared lock_sys.latch */
+  void rd_lock()
+  {
+    mysql_mutex_assert_not_owner(&wait_mutex);
+    ut_ad(!is_writer());
+    latch.rd_lock();
+    ut_ad(!writer.load(std::memory_order_relaxed));
+    ut_d(readers.fetch_add(1, std::memory_order_relaxed));
+  }
+  /** Release shared lock_sys.latch */
+  void rd_unlock()
+  {
+    ut_ad(!is_writer());
+    ut_ad(readers.fetch_sub(1, std::memory_order_relaxed));
+    latch.rd_unlock();
+  }
+#endif
+  /** Try to acquire exclusive lock_sys.latch
+  @return whether the latch was acquired */
+  bool wr_lock_try()
+  {
+    ut_ad(!is_writer());
+    if (!latch.wr_lock_try()) return false;
+    ut_ad(!writer.exchange(pthread_self(),
+                           std::memory_order_relaxed));
+    return true;
+  }
+  /** Try to acquire shared lock_sys.latch
+  @return whether the latch was acquired */
+  bool rd_lock_try()
+  {
+    ut_ad(!is_writer());
+    if (!latch.rd_lock_try()) return false;
+    ut_ad(!writer.load(std::memory_order_relaxed));
+    ut_d(readers.fetch_add(1, std::memory_order_relaxed));
+    return true;
+  }
+
+  /** Assert that wr_lock() has been invoked by this thread */
+  void assert_locked() const { ut_ad(is_writer()); }
+  /** Assert that wr_lock() has not been invoked by this thread */
+  void assert_unlocked() const { ut_ad(!is_writer()); }
+#ifdef UNIV_DEBUG
+  /** @return whether the current thread is the lock_sys.latch writer */
+  bool is_writer() const
+  {
+# ifdef SUX_LOCK_GENERIC
+    return writer.load(std::memory_order_relaxed) == pthread_self();
+# else
+    return writer.load(std::memory_order_relaxed) == pthread_self() ||
+      (xtest() && !latch.is_locked_or_waiting());
+# endif
+  }
+  /** Assert that a lock shard is exclusively latched (by some thread) */
+  void assert_locked(const lock_t &lock) const;
+  /** Assert that a table lock shard is exclusively latched by this thread */
+  void assert_locked(const dict_table_t &table) const;
+  /** Assert that a hash table cell is exclusively latched (by some thread) */
+  void assert_locked(const hash_cell_t &cell) const;
+#else
+  void assert_locked(const lock_t &) const {}
+  void assert_locked(const dict_table_t &) const {}
+  void assert_locked(const hash_cell_t &) const {}
+#endif
+
+  /**
+    Creates the lock system at database start.
+
+    @param[in] n_cells number of slots in lock hash table
+  */
+  void create(ulint n_cells);
+
+
+  /**
+    Resize the lock hash table.
+
+    @param[in] n_cells number of slots in lock hash table
+  */
+  void resize(ulint n_cells);
+
+
+  /** Closes the lock system at database shutdown. */
+  void close();
+
+
+  /** Check for deadlocks while holding only lock_sys.wait_mutex. */
+  void deadlock_check();
+
+  /** Cancel a waiting lock request.
+  @tparam check_victim  whether to check for DB_DEADLOCK
+  @param trx            active transaction
+  @param lock           waiting lock request
+  @retval DB_SUCCESS    if no lock existed
+  @retval DB_DEADLOCK   if trx->lock.was_chosen_as_deadlock_victim was set
+  @retval DB_LOCK_WAIT  if the lock was canceled */
+  template<bool check_victim>
+  static dberr_t cancel(trx_t *trx, lock_t *lock);
+
+  /** Note that a record lock wait started */
+  inline void wait_start();
+
+  /** Note that a record lock wait resumed */
+  inline void wait_resume(THD *thd, my_hrtime_t start, my_hrtime_t now);
+
+  /** @return pending number of lock waits */
+  ulint get_wait_pending() const
+  {
+    return static_cast<ulint>(wait_count & (WAIT_COUNT_STEP - 1));
+  }
+  /** @return cumulative number of lock waits */
+  ulint get_wait_cumulative() const
+  { return static_cast<ulint>(wait_count / WAIT_COUNT_STEP); }
+  /** Cumulative wait time; protected by wait_mutex */
+  uint64_t get_wait_time_cumulative() const { return wait_time; }
+  /** Longest wait time; protected by wait_mutex */
+  uint64_t get_wait_time_max() const { return wait_time_max; }
+
+  /** Get the lock hash table for a mode */
+  hash_table &hash_get(ulint mode)
+  {
+    if (UNIV_LIKELY(!(mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE))))
+      return rec_hash;
+    return (mode & LOCK_PREDICATE) ? prdt_hash : prdt_page_hash;
+  }
+
+  /** @return prdt_page_hash if page is set, otherwise prdt_hash */
+  hash_table &prdt_hash_get(bool page)
+  { return page ? prdt_page_hash : prdt_hash; }
+
+  /** Get the first lock on a page.
+  @param cell        hash table cell
+  @param id          page number
+  @return first lock
+  @retval nullptr if none exists */
+  static inline lock_t *get_first(const hash_cell_t &cell, page_id_t id);
+
+  /** Get the first explicit lock request on a record.
+  @param cell     first lock hash table cell
+  @param id       page identifier
+  @param heap_no  record identifier in page
+  @return first lock
+  @retval nullptr if none exists */
+  static inline lock_t *get_first(const hash_cell_t &cell, page_id_t id,
+                                  ulint heap_no);
+
+  /** Remove locks on a discarded SPATIAL INDEX page.
+  @param id   page to be discarded
+  @param all  whether to discard also from lock_sys.prdt_hash */
+  void prdt_page_free_from_discard(const page_id_t id, bool all= false);
+
+  /** Cancel possible lock waiting for a transaction */
+  static void cancel_lock_wait_for_trx(trx_t *trx);
+#ifdef WITH_WSREP
+  /** Cancel lock waiting for a wsrep BF abort. */
+  static void cancel_lock_wait_for_wsrep_bf_abort(trx_t *trx);
+#endif /* WITH_WSREP */
+};
+
+/** The lock system */
+extern lock_sys_t lock_sys;
+
+/** @return the padded array[] index for fold, based on the current n_cells */
+inline ulint lock_sys_t::hash_table::calc_hash(ulint fold) const
+{
+  ut_ad(lock_sys.is_writer() || lock_sys.readers);
+  return calc_hash(fold, n_cells);
+}
+
+/** @return the array[] element that a fold value hashes to */
+inline hash_cell_t *lock_sys_t::hash_table::cell_get(ulint fold) const
+{
+  ut_ad(lock_sys.is_writer() || lock_sys.readers);
+  return array + calc_hash(fold);
+}
+
+/** Find the first lock in a hash chain that belongs to a given page.
+@param cell  hash table cell whose chain is searched
+@param id    page identifier to match
+@return the first matching lock, or nullptr if none exists */
+inline lock_t *lock_sys_t::get_first(const hash_cell_t &cell, page_id_t id)
+{
+  lock_sys.assert_locked(cell);
+  lock_t *lock= static_cast<lock_t*>(cell.node);
+  for (; lock; lock= lock->hash)
+  {
+    ut_ad(!lock->is_table());
+    if (lock->un_member.rec_lock.page_id == id)
+      break;
+  }
+  return lock;
+}
+
+/** lock_sys.latch exclusive guard */
+struct LockMutexGuard
+{
+  LockMutexGuard(SRW_LOCK_ARGS(const char *file, unsigned line))
+  { lock_sys.wr_lock(SRW_LOCK_ARGS(file, line)); }
+  ~LockMutexGuard() { lock_sys.wr_unlock(); }
+};
+
+/** Guard of a hash_latch and shared lock_sys.latch for 1 page_id_t */
+struct LockGuard
+{
+  LockGuard(lock_sys_t::hash_table &hash, const page_id_t id);
+  ~LockGuard() /* release the hash_latch of cell_, then lock_sys.latch */
+  {
+    lock_sys_t::hash_table::latch(cell_)->release();
+    /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
+    lock_sys.rd_unlock();
+  }
+  /** @return the hash array cell */
+  hash_cell_t &cell() const { return *cell_; }
+private:
+  /** The hash array cell whose latch is released by the destructor */
+  hash_cell_t *cell_;
+};
+
+/** lock_sys latch guard for 2 page_id_t */
+struct LockMultiGuard
+{
+  LockMultiGuard(lock_sys_t::hash_table &hash,
+                 const page_id_t id1, const page_id_t id2);
+  ~LockMultiGuard();
+
+  /** @return the first hash array cell */
+  hash_cell_t &cell1() const { return *cell1_; }
+  /** @return the second hash array cell */
+  hash_cell_t &cell2() const { return *cell2_; }
+private:
+  /** The first hash array cell */
+  hash_cell_t *cell1_;
+  /** The second hash array cell */
+  hash_cell_t *cell2_;
+};
+
+/** lock_sys.latch exclusive guard using transactional memory */
+struct TMLockMutexGuard
+{
+  TRANSACTIONAL_INLINE
+  TMLockMutexGuard(SRW_LOCK_ARGS(const char *file, unsigned line))
+  {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+    if (xbegin())
+    {
+      if (was_elided())
+        return;
+      xabort();
+    }
+#endif
+    lock_sys.wr_lock(SRW_LOCK_ARGS(file, line));
+  }
+  TRANSACTIONAL_INLINE
+  ~TMLockMutexGuard()
+  {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+    if (was_elided()) xend(); else
+#endif
+    lock_sys.wr_unlock();
+  }
+
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+  bool was_elided() const noexcept
+  { return !lock_sys.latch.is_locked_or_waiting(); }
+#else
+  bool was_elided() const noexcept { return false; }
+#endif
+};
+
+/** lock_sys latch guard for 1 page_id_t, using transactional memory */
+struct TMLockGuard
+{
+  TRANSACTIONAL_TARGET
+  TMLockGuard(lock_sys_t::hash_table &hash, const page_id_t id);
+  TRANSACTIONAL_INLINE ~TMLockGuard()
+  {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+    if (elided)
+    {
+      xend();
+      return;
+    }
+#endif
+    lock_sys_t::hash_table::latch(cell_)->release();
+    /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
+    lock_sys.rd_unlock();
+  }
+  /** @return the hash array cell */
+  hash_cell_t &cell() const { return *cell_; }
+private:
+  /** The hash array cell */
+  hash_cell_t *cell_;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+  /** whether the latches were elided */
+  bool elided;
+#endif
+};
+
+/** guard for shared lock_sys.latch and trx_t::mutex using
+transactional memory */
+struct TMLockTrxGuard
+{
+  trx_t &trx;
+
+  TRANSACTIONAL_INLINE
+#ifndef UNIV_PFS_RWLOCK
+  TMLockTrxGuard(trx_t &trx) : trx(trx)
+# define TMLockTrxArgs(trx) trx
+#else
+  TMLockTrxGuard(const char *file, unsigned line, trx_t &trx) : trx(trx)
+# define TMLockTrxArgs(trx) SRW_LOCK_CALL, trx
+#endif
+  {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+    if (xbegin())
+    {
+      if (!lock_sys.latch.is_write_locked() && was_elided())
+        return;
+      xabort();
+    }
+#endif
+    lock_sys.rd_lock(SRW_LOCK_ARGS(file, line));
+    trx.mutex_lock();
+  }
+  TRANSACTIONAL_INLINE
+  ~TMLockTrxGuard()
+  {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+    if (was_elided())
+    {
+      xend();
+      return;
+    }
+#endif
+    lock_sys.rd_unlock();
+    trx.mutex_unlock();
+  }
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+  bool was_elided() const noexcept { return !trx.mutex_is_locked(); }
+#else
+  bool was_elided() const noexcept { return false; }
+#endif
+};
+
+/** guard for trx_t::mutex using transactional memory */
+struct TMTrxGuard
+{
+  trx_t &trx;
+
+  TRANSACTIONAL_INLINE TMTrxGuard(trx_t &trx) : trx(trx)
+  {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+    if (xbegin())
+    {
+      if (was_elided())
+        return;
+      xabort();
+    }
+#endif
+    trx.mutex_lock();
+  }
+  TRANSACTIONAL_INLINE ~TMTrxGuard()
+  {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+    if (was_elided())
+    {
+      xend();
+      return;
+    }
+#endif
+    trx.mutex_unlock();
+  }
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+  bool was_elided() const noexcept { return !trx.mutex_is_locked(); }
+#else
+  bool was_elided() const noexcept { return false; }
+#endif
+};
+
+/*********************************************************************//**
+Creates a new record lock and inserts it to the lock queue. Does NOT check
+for deadlocks or lock compatibility!
+@return created lock */
+UNIV_INLINE
+lock_t*
+lock_rec_create(
+/*============*/
+	lock_t*			c_lock,	/*!< conflicting lock */
+	unsigned		type_mode,/*!< in: lock mode and wait flag */
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	ulint			heap_no,/*!< in: heap number of the record */
+	dict_index_t*		index,	/*!< in: index of record */
+	trx_t*			trx,	/*!< in,out: transaction */
+	bool			caller_owns_trx_mutex);
+					/*!< in: true if caller owns
+					trx mutex */
+
+/** Remove a record lock request, waiting or granted, on a discarded page
+@param lock_hash  hash table
+@param in_lock    lock object */
+void lock_rec_discard(lock_sys_t::hash_table &lock_hash, lock_t *in_lock);
+
+/** Create a new record lock and inserts it to the lock queue,
+without checking for deadlocks or conflicts.
+@param[in]	c_lock		conflicting lock, or NULL
+@param[in]	type_mode	lock mode and wait flag
+@param[in]	page_id		index page number
+@param[in]	page		R-tree index page, or NULL
+@param[in]	heap_no		record heap number in the index page
+@param[in]	index		the index tree
+@param[in,out]	trx		transaction
+@param[in]	holds_trx_mutex	whether the caller holds trx->mutex
+@return created lock */
+lock_t*
+lock_rec_create_low(
+	lock_t*		c_lock,
+	unsigned	type_mode,
+	const page_id_t	page_id,
+	const page_t*	page,
+	ulint		heap_no,
+	dict_index_t*	index,
+	trx_t*		trx,
+	bool		holds_trx_mutex);
+
+/** Enqueue a waiting request for a lock which cannot be granted immediately.
+Check for deadlocks.
+@param[in]	c_lock		conflicting lock
+@param[in]	type_mode	the requested lock mode (LOCK_S or LOCK_X)
+				possibly ORed with LOCK_GAP or
+				LOCK_REC_NOT_GAP, ORed with
+				LOCK_INSERT_INTENTION if this
+				waiting lock request is set
+				when performing an insert of
+				an index record
+@param[in]	id		page identifier
+@param[in]	page		leaf page in the index
+@param[in]	heap_no		record heap number in the block
+@param[in]	index		index tree
+@param[in,out]	thr		query thread
+@param[in]	prdt		minimum bounding box (spatial index)
+@retval	DB_LOCK_WAIT		if the waiting lock was enqueued
+@retval	DB_DEADLOCK		if this transaction was chosen as the victim */
+dberr_t
+lock_rec_enqueue_waiting(
+	lock_t*			c_lock,
+	unsigned		type_mode,
+	const page_id_t		id,
+	const page_t*		page,
+	ulint			heap_no,
+	dict_index_t*		index,
+	que_thr_t*		thr,
+	lock_prdt_t*		prdt);
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list start is moved to another page. */
+void
+lock_rtr_move_rec_list(
+/*===================*/
+	const buf_block_t*	new_block,	/*!< in: index page to
+						move to */
+	const buf_block_t*	block,		/*!< in: index page */
+	rtr_rec_move_t*		rec_move,	/*!< in: recording records
+						moved */
+	ulint			num_move);	/*!< in: num of rec to move */
+
+#include "lock0lock.inl"
+
+#endif
diff --git a/storage/innobase/include/lock0lock.inl b/storage/innobase/include/lock0lock.inl
new file mode 100644
index 00000000..1b9255ff
--- /dev/null
+++ b/storage/innobase/include/lock0lock.inl
@@ -0,0 +1,78 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0lock.inl
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0dict.h"
+#include "buf0buf.h"
+#include "page0page.h"
+
+/*********************************************************************//**
+Returns the heap number of the record that follows the page infimum,
+i.e. the smallest user record, or PAGE_HEAP_NO_SUPREMUM on an empty page.
+@return heap_no of the first record after the infimum */
+UNIV_INLINE
+ulint
+lock_get_min_heap_no(
+/*=================*/
+	const buf_block_t*	block)	/*!< in: buffer block */
+{
+	const page_t* const	frame	= block->page.frame;
+
+	/* Non-compact pages use the old-style infimum and accessors. */
+	if (!page_is_comp(frame)) {
+		const page_t*	first	= frame + rec_get_next_offs(
+			frame + PAGE_OLD_INFIMUM, FALSE);
+		return(rec_get_heap_no_old(first));
+	}
+
+	const page_t*	first	= frame + rec_get_next_offs(
+		frame + PAGE_NEW_INFIMUM, TRUE);
+	return(rec_get_heap_no_new(first));
+}
+
+/*********************************************************************//**
+Convenience wrapper of lock_rec_create_low() that takes the page identifier
+and the page frame from a buffer block. Does NOT check for deadlocks or lock
+compatibility!
+@return created lock */
+UNIV_INLINE
+lock_t*
+lock_rec_create(
+/*============*/
+	lock_t*			c_lock,	/*!< conflicting lock */
+	unsigned		type_mode,/*!< in: lock mode and wait flag */
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	ulint			heap_no,/*!< in: heap number of the record */
+	dict_index_t*		index,	/*!< in: index of record */
+	trx_t*			trx,	/*!< in,out: transaction */
+	bool			caller_owns_trx_mutex)
+					/*!< in: true if caller owns trx mutex */
+{
+	const auto&	bpage	= block->page;
+
+	return lock_rec_create_low(c_lock, type_mode, bpage.id(), bpage.frame,
+				   heap_no, index, trx, caller_owns_trx_mutex);
+}
diff --git a/storage/innobase/include/lock0prdt.h b/storage/innobase/include/lock0prdt.h
new file mode 100644
index 00000000..db8e3392
--- /dev/null
+++ b/storage/innobase/include/lock0prdt.h
@@ -0,0 +1,192 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0prdt.h
+The predicate lock system
+
+Created 9/7/2013 Jimmy Yang
+*******************************************************/
+#ifndef lock0prdt_h
+#define lock0prdt_h
+
+#include "lock0lock.h"
+
+/** Predicate lock data: a predicate payload pointer and a predicate operator */
+typedef struct lock_prdt {
+	void*		data;		/* Predicate data (untyped pointer) */
+	uint16		op;		/* Predicate operator */
+} lock_prdt_t;
+
+/*********************************************************************//**
+Acquire a predicate lock on a block
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_prdt_lock(
+/*===========*/
+	buf_block_t*	block,	/*!< in/out: buffer block of rec */
+	lock_prdt_t*	prdt,	/*!< in: Predicate for the lock */
+	dict_index_t*	index,	/*!< in: secondary index */
+	enum lock_mode	mode,	/*!< in: mode of the lock which
+				the read cursor should set on
+				records: LOCK_S or LOCK_X; the
+				latter is possible in
+				SELECT FOR UPDATE */
+	unsigned	type_mode,
+				/*!< in: LOCK_PREDICATE or LOCK_PRDT_PAGE */
+	que_thr_t*	thr);	/*!< in: query thread
+				(can be NULL if BTR_NO_LOCKING_FLAG) */
+
+/*********************************************************************//**
+Acquire a "Page" lock on a block
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_place_prdt_page_lock(
+	const page_id_t	page_id,	/*!< in: page identifier */
+	dict_index_t*	index,	/*!< in: secondary index */
+	que_thr_t*	thr);	/*!< in: query thread */
+
+/*********************************************************************//**
+Initialize a predicate lock from a minimum bounding rectangle (MBR) */
+void
+lock_init_prdt_from_mbr(
+/*====================*/
+	lock_prdt_t*	prdt,	/*!< in/out: predicate to be initialized */
+	rtr_mbr_t*	mbr,	/*!< in: Minimum Bounding Rectangle */
+	ulint		mode,	/*!< in: Search mode */
+	mem_heap_t*	heap);	/*!< in: heap for allocating memory */
+
+/*********************************************************************//**
+Get the predicate (lock_prdt_t) of a predicate lock
+@return the lock predicate */
+lock_prdt_t*
+lock_get_prdt_from_lock(
+/*====================*/
+	const lock_t*	lock);	/*!< in: the lock */
+
+/*********************************************************************//**
+Checks if a predicate lock request for a new lock has to wait for
+request lock2.
+@return true if new lock has to wait for lock2 to be removed */
+bool
+lock_prdt_has_to_wait(
+/*==================*/
+	const trx_t*	trx,	/*!< in: trx of new lock */
+	unsigned	type_mode,/*!< in: precise mode of the new lock
+				to set: LOCK_S or LOCK_X, possibly
+				ORed to LOCK_PREDICATE or LOCK_PRDT_PAGE,
+				LOCK_INSERT_INTENTION */
+	lock_prdt_t*	prdt,	/*!< in: lock predicate to check */
+	const lock_t*	lock2);	/*!< in: another record lock; NOTE that
+				it is assumed that this has a lock bit
+				set on the same record as in the new
+				lock we are setting */
+
+/**************************************************************//**
+Update predicate locks when a page splits */
+void
+lock_prdt_update_split(
+/*===================*/
+	buf_block_t*	new_block,	/*!< in/out: the new half page */
+	lock_prdt_t*	prdt,		/*!< in: MBR on the old page */
+	lock_prdt_t*	new_prdt,	/*!< in: MBR on the new page */
+	const page_id_t	page_id);	/*!< in: page identifier */
+
+/**************************************************************//**
+Adjust locks from an ancestor page of the R-tree on the appropriate level. */
+void
+lock_prdt_update_parent(
+/*====================*/
+	buf_block_t*	left_block,	/*!< in/out: page to be split */
+	buf_block_t*	right_block,	/*!< in/out: the new half page */
+	lock_prdt_t*	left_prdt,	/*!< in: MBR on the old page */
+	lock_prdt_t*	right_prdt,	/*!< in: MBR on the new page */
+	const page_id_t	page_id);	/*!< in: parent page */
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate insert of
+a predicate record.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_prdt_insert_check_and_lock(
+/*============================*/
+	const rec_t*	rec,	/*!< in: record after which to insert */
+	buf_block_t*	block,	/*!< in/out: buffer block of rec */
+	dict_index_t*	index,	/*!< in: index */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	lock_prdt_t*	prdt);	/*!< in: Minimum Bound Rectangle */
+
+/*********************************************************************//**
+Append a predicate to the lock */
+void
+lock_prdt_set_prdt(
+/*===============*/
+	lock_t*			lock,	/*!< in: lock */
+	const lock_prdt_t*	prdt);	/*!< in: Predicate */
+
+#if 0
+
+/*********************************************************************//**
+Checks if a predicate lock request for a new lock has to wait for
+request lock2.
+@return true if new lock has to wait for lock2 to be removed */
+UNIV_INLINE
+bool
+lock_prdt_has_to_wait(
+/*==================*/
+	const trx_t*	trx,	/*!< in: trx of new lock */
+	unsigned	type_mode,/*!< in: precise mode of the new lock
+				to set: LOCK_S or LOCK_X, possibly
+				ORed to LOCK_PREDICATE or LOCK_PRDT_PAGE,
+				LOCK_INSERT_INTENTION */
+	lock_prdt_t*	prdt,	/*!< in: lock predicate to check */
+	const lock_t*	lock2);	/*!< in: another record lock; NOTE that
+				it is assumed that this has a lock bit
+				set on the same record as in the new
+				lock we are setting */
+
+/*********************************************************************//**
+Get predicate lock's minimum bounding box
+@return the minimum bounding box*/
+UNIV_INLINE
+rtr_mbr_t*
+prdt_get_mbr_from_prdt(
+/*===================*/
+	const lock_prdt_t*	prdt);	/*!< in: the lock predicate */
+
+
+#endif
+/*************************************************************//**
+Moves the locks of a record to another record and resets the lock bits of
+the donating record. */
+void
+lock_prdt_rec_move(
+/*===============*/
+	const buf_block_t*	receiver,	/*!< in: buffer block containing
+						the receiving record */
+	const page_id_t		donator);	/*!< in: page identifier of the donator */
+
+/** Check whether there are R-tree Page lock on a page
+@param[in]	trx	trx to test the lock
+@param[in]	page_id	page identifier
+@return	true if there is none */
+bool lock_test_prdt_page_lock(const trx_t *trx, const page_id_t page_id);
+
+#endif
diff --git a/storage/innobase/include/lock0priv.h b/storage/innobase/include/lock0priv.h
new file mode 100644
index 00000000..b0a5f7aa
--- /dev/null
+++ b/storage/innobase/include/lock0priv.h
@@ -0,0 +1,582 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0priv.h
+Lock module internal structures and methods.
+
+Created July 12, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef lock0priv_h
+#define lock0priv_h
+
+#ifndef LOCK_MODULE_IMPLEMENTATION
+/* If you need to access members of the structures defined in this
+file, please write appropriate functions that retrieve them and put
+those functions in lock/ */
+#error Do not include lock0priv.h outside of the lock/ module
+#endif
+
+#include "hash0hash.h"
+#include "rem0types.h"
+#include "trx0trx.h"
+
+#ifndef UINT32_MAX
+#define UINT32_MAX             (4294967295U)
+#endif
+
+/** Write "[lock_table_t: name=<table name>]" to an output stream.
+@param[in,out]	out	the output stream
+@return the given output stream */
+inline
+std::ostream& lock_table_t::print(std::ostream& out) const
+{
+	out << "[lock_table_t: name=" << table->name;
+	return out << "]";
+}
+
+/** Stream insertion operator for lock_table_t; delegates to
+lock_table_t::print().
+@param[in,out]	out	the output stream
+@param[in]	lock	the table lock
+@return the given output stream */
+inline
+std::ostream&
+operator<<(std::ostream& out, const lock_table_t& lock)
+{
+	return lock.print(out);
+}
+
+/** Print the lock: type_mode, type_string(), the name of the basic mode,
+the record lock flags that are set, and the table or record lock member.
+@param[in,out]	out	the output stream
+@return the given output stream */
+inline
+std::ostream&
+ib_lock_t::print(std::ostream& out) const
+{
+  /* mode_names[] is indexed by mode(); pin the values it relies on. */
+  static_assert(LOCK_MODE_MASK == 7, "compatibility");
+  static_assert(LOCK_IS == 0, "compatibility");
+  static_assert(LOCK_IX == 1, "compatibility");
+  static_assert(LOCK_S == 2, "compatibility");
+  static_assert(LOCK_X == 3, "compatibility");
+  static_assert(LOCK_AUTO_INC == 4, "compatibility");
+  static_assert(LOCK_NONE == 5, "compatibility");
+  static_assert(LOCK_NONE_UNSET == 7, "compatibility");
+  const char *const mode_names[8]=
+  { "IS", "IX", "S", "X", "AUTO_INC", "NONE", "?", "NONE_UNSET" };
+
+  out << "[lock_t: type_mode=" << type_mode;
+  out << "(" << type_string() << " | LOCK_" << mode_names[mode()];
+
+  /* Append the name of each flag that is set, in a fixed order. */
+  auto flag= [&out](bool set, const char *name) { if (set) out << name; };
+  flag(is_record_not_gap(), " | LOCK_REC_NOT_GAP");
+  flag(is_waiting(), " | LOCK_WAIT");
+  flag(is_gap(), " | LOCK_GAP");
+  flag(is_insert_intention(), " | LOCK_INSERT_INTENTION");
+
+  out << ")";
+
+  if (!is_table())
+    out << un_member.rec_lock;
+  else
+    out << un_member.tab_lock;
+
+  return out << "]";
+}
+
+/** Output operator for ib_lock_t; delegates to ib_lock_t::print(). */
+inline std::ostream&
+operator<<(std::ostream& out, const ib_lock_t& lock)
+{
+	return lock.print(out);
+}
+
+#ifdef UNIV_DEBUG
+extern ibool	lock_print_waits;
+#endif /* UNIV_DEBUG */
+
+/* An explicit record lock affects both the record and the gap before it.
+An implicit x-lock does not affect the gap, it only locks the index
+record from read or update.
+
+If a transaction has modified or inserted an index record, then
+it owns an implicit x-lock on the record. On a secondary index record,
+a transaction has an implicit x-lock also if it has modified the
+clustered index record, the max trx id of the page where the secondary
+index record resides is >= trx id of the transaction (or database recovery
+is running), and there are no explicit non-gap lock requests on the
+secondary index record.
+
+This complicated definition for a secondary index comes from the
+implementation: we want to be able to determine if a secondary index
+record has an implicit x-lock, just by looking at the present clustered
+index record, not at the historical versions of the record. The
+complicated definition can be explained to the user so that there is
+nondeterminism in the access path when a query is answered: we may,
+or may not, access the clustered index record and thus may, or may not,
+bump into an x-lock set there.
+
+Different transactions can have conflicting locks set on the gap at the
+same time. The locks on the gap are purely inhibitive: an insert cannot
+be made, or a select cursor may have to wait if a different transaction
+has a conflicting lock on the gap. An x-lock on the gap does not give
+the right to insert into the gap.
+
+An explicit lock can be placed on a user record or the supremum record of
+a page. The locks on the supremum record are always thought to be of the gap
+type, though the gap bit is not set. When we perform an update of a record
+where the size of the record changes, we may temporarily store its explicit
+locks on the infimum record of the page, though the infimum otherwise never
+carries locks.
+
+A waiting record lock can also be of the gap type. A waiting lock request
+can be granted when there is no conflicting mode lock request by another
+transaction ahead of it in the explicit lock queue.
+
+In version 4.0.5 we added yet another explicit lock type: LOCK_REC_NOT_GAP.
+It only locks the record it is placed on, not the gap before the record.
+This lock type is necessary to emulate an Oracle-like READ COMMITTED isolation
+level.
+
+-------------------------------------------------------------------------
+RULE 1: If there is an implicit x-lock on a record, and there are non-gap
+-------
+lock requests waiting in the queue, then the transaction holding the implicit
+x-lock also has an explicit non-gap record x-lock. Therefore, as locks are
+released, we can grant locks to waiting lock requests purely by looking at
+the explicit lock requests in the queue.
+
+RULE 3: Different transactions cannot have conflicting granted non-gap locks
+-------
+on a record at the same time. However, they can have conflicting granted gap
+locks.
+RULE 4: If there is a waiting lock request in a queue, no lock request,
+-------
+gap or not, can be inserted ahead of it in the queue. In record deletes
+and page splits new gap type locks can be created by the database manager
+for a transaction, and without rule 4, the waits-for graph of transactions
+might become cyclic without the database noticing it, as the deadlock check
+is only performed when a transaction itself requests a lock!
+-------------------------------------------------------------------------
+
+An insert is allowed to a gap if there are no explicit lock requests by
+other transactions on the next record. It does not matter if these lock
+requests are granted or waiting, gap bit set or not, with the exception
+that a gap type request set by another transaction to wait for
+its turn to do an insert is ignored. On the other hand, an
+implicit x-lock by another transaction does not prevent an insert, which
+allows for more concurrency when using an Oracle-style sequence number
+generator for the primary key with many transactions doing inserts
+concurrently.
+
+A modify of a record is allowed if the transaction has an x-lock on the
+record, or if other transactions do not have any non-gap lock requests on the
+record.
+
+A read of a single user record with a cursor is allowed if the transaction
+has a non-gap explicit, or an implicit lock on the record, or if the other
+transactions have no x-lock requests on the record. At a page supremum a
+read is always allowed.
+
+In summary, an implicit lock is seen as a granted x-lock only on the
+record, not on the gap. An explicit lock with no gap bit set is a lock
+both on the record and the gap. If the gap bit is set, the lock is only
+on the gap. Different transactions cannot own conflicting locks on the
+record at the same time, but they may own conflicting locks on the gap.
+Granted locks on a record give an access right to the record, but gap type
+locks just inhibit operations.
+
+NOTE: Finding out if some transaction has an implicit x-lock on a secondary
+index record can be cumbersome. We may have to look at previous versions of
+the corresponding clustered index record to find out if a delete marked
+secondary index record was delete marked by an active transaction, not by
+a committed one.
+
+FACT A: If a transaction has inserted a row, it can delete it any time
+without need to wait for locks.
+
+PROOF: The transaction has an implicit x-lock on every index record inserted
+for the row, and can thus modify each record without the need to wait. Q.E.D.
+
+FACT B: If a transaction has read some result set with a cursor, it can read
+it again, and retrieves the same result set, if it has not modified the
+result set in the meantime. Hence, there is no phantom problem. If the
+biggest record, in the alphabetical order, touched by the cursor is removed,
+a lock wait may occur, otherwise not.
+
+PROOF: When a read cursor proceeds, it sets an s-lock on each user record
+it passes, and a gap type s-lock on each page supremum. The cursor must
+wait until it has these locks granted. Then no other transaction can
+have a granted x-lock on any of the user records, and therefore cannot
+modify the user records. Neither can any other transaction insert into
+the gaps which were passed over by the cursor. Page splits and merges,
+and removal of obsolete versions of records do not affect this, because
+when a user record or a page supremum is removed, the next record inherits
+its locks as gap type locks, and therefore blocks inserts to the same gap.
+Also, if a page supremum is inserted, it inherits its locks from the successor
+record. When the cursor is positioned again at the start of the result set,
+the records it will touch on its course are either records it touched
+during the last pass or new inserted page supremums. It can immediately
+access all these records, and when it arrives at the biggest record, it
+notices that the result set is complete. If the biggest record was removed,
+lock wait can occur because the next record only inherits a gap type lock,
+and a wait may be needed. Q.E.D. */
+
+/* If an index record should be changed or a new one inserted, we must check
+the lock on the record or the next. When a read cursor starts reading,
+we will set a record level s-lock on each record it passes, except on the
+initial record on which the cursor is positioned before we start to fetch
+records. Our index tree search has the convention that the B-tree
+cursor is positioned BEFORE the first possibly matching record in
+the search. Optimizations are possible here: if the record is searched
+on an equality condition to a unique key, we could actually set a special
+lock on the record, a lock which would not prevent any insert before
+this record. In the next key locking an x-lock set on a record also
+prevents inserts just before that record.
+	There are special infimum and supremum records on each page.
+A supremum record can be locked by a read cursor. This record cannot be
+updated but the lock prevents insert of a user record to the end of
+the page.
+	Next key locks will prevent the phantom problem where new rows
+could appear to SELECT result sets after the select operation has been
+performed. Prevention of phantoms ensures the serializability of
+transactions.
+	What should we check if an insert of a new record is wanted?
+Only the lock on the next record on the same page, because also the
+supremum record can carry a lock. An s-lock prevents insertion, but
+what about an x-lock? If it was set by a searched update, then there
+is implicitly an s-lock, too, and the insert should be prevented.
+What if our transaction owns an x-lock to the next record, but there is
+a waiting s-lock request on the next record? If this s-lock was placed
+by a read cursor moving in the ascending order in the index, we cannot
+do the insert immediately, because when we finally commit our transaction,
+the read cursor should see also the new inserted record. So we should
+move the read cursor backward from the next record for it to pass over
+the new inserted record. This move backward may be too cumbersome to
+implement. If we in this situation just enqueue a second x-lock request
+for our transaction on the next record, then the deadlock mechanism
+notices a deadlock between our transaction and the s-lock request
+transaction. This seems to be an ok solution.
+	We could have the convention that granted explicit record locks,
+lock the corresponding records from changing, and also lock the gaps
+before them from inserting. A waiting explicit lock request locks the gap
+before from inserting. Implicit record x-locks, which we derive from the
+transaction id in the clustered index record, only lock the record itself
+from modification, not the gap before it from inserting.
+	How should we store update locks? If the search is done by a unique
+key, we could just modify the record trx id. Otherwise, we could put a record
+x-lock on the record. If the update changes ordering fields of the
+clustered index record, the inserted new record needs no record lock in
+lock table, the trx id is enough. The same holds for a secondary index
+record. Searched delete is similar to update.
+
+PROBLEM:
+What about waiting lock requests? If a transaction is waiting to make an
+update to a record which another modified, how does the other transaction
+know to send the end-lock-wait signal to the waiting transaction? If we have
+the convention that a transaction may wait for just one lock at a time, how
+do we preserve it if lock wait ends?
+
+PROBLEM:
+Checking the trx id label of a secondary index record. In the case of a
+modification, not an insert, is this necessary? A secondary index record
+is modified only by setting or resetting its deleted flag. A secondary index
+record contains fields to uniquely determine the corresponding clustered
+index record. A secondary index record is therefore only modified if we
+also modify the clustered index record, and the trx id checking is done
+on the clustered index record, before we come to modify the secondary index
+record. So, in the case of delete marking or unmarking a secondary index
+record, we do not have to care about trx ids, only the locks in the lock
+table must be checked. In the case of a select from a secondary index, the
+trx id is relevant, and in this case we may have to search the clustered
+index record.
+
+PROBLEM: How to update record locks when page is split or merged, or
+--------------------------------------------------------------------
+a record is deleted or updated?
+If the size of fields in a record changes, we perform the update by
+a delete followed by an insert. How can we retain the locks set or
+waiting on the record? Because a record lock is indexed in the bitmap
+by the heap number of the record, when we remove the record from the
+record list, it is possible still to keep the lock bits. If the page
+is reorganized, we could make a table of old and new heap numbers,
+and permute the bitmaps in the locks accordingly. We can add to the
+table a row telling where the updated record ended. If the update does
+not require a reorganization of the page, we can simply move the lock
+bits for the updated record to the position determined by its new heap
+number (we may have to allocate a new lock, if we run out of the bitmap
+in the old one).
+	A more complicated case is the one where the reinsertion of the
+updated record is done pessimistically, because the structure of the
+tree may change.
+
+PROBLEM: If a supremum record is removed in a page merge, or a record
+---------------------------------------------------------------------
+removed in a purge, what to do to the waiting lock requests? In a split to
+the right, we just move the lock requests to the new supremum. If a record
+is removed, we could move the waiting lock request to its inheritor, the
+next record in the index. But, the next record may already have lock
+requests on its own queue. A new deadlock check should be made then. Maybe
+it is easier just to release the waiting transactions. They can then enqueue
+new lock requests on appropriate records.
+
+PROBLEM: When a record is inserted, what locks should it inherit from the
+-------------------------------------------------------------------------
+upper neighbor? An insert of a new supremum record in a page split is
+always possible, but an insert of a new user record requires that the upper
+neighbor does not have any lock requests by other transactions, granted or
+waiting, in its lock queue. Solution: We can copy the locks as gap type
+locks, so that also the waiting locks are transformed to granted gap type
+locks on the inserted record. */
+
+/* LOCK COMPATIBILITY MATRIX (+ = compatible, - = incompatible)
+ *    IS IX S  X  AI
+ * IS +  +  +  -  +
+ * IX +  +  -  -  +
+ * S  +  -  +  -  -
+ * X  -  -  -  -  -
+ * AI +  +  -  -  -
+ *
+ * Note that for rows, InnoDB only acquires S or X locks.
+ * For tables, InnoDB normally acquires IS or IX locks.
+ * S or X table locks are only acquired for LOCK TABLES.
+ * Auto-increment (AI) locks are needed because of
+ * statement-level MySQL binlog.
+ * See also lock_mode_compatible().
+ */
+static const byte lock_compatibility_matrix[5][5] = {
+ /**         IS     IX     S      X       AI */
+ /* IS */ {  TRUE,  TRUE,  TRUE,  FALSE,  TRUE},
+ /* IX */ {  TRUE,  TRUE,  FALSE, FALSE,  TRUE},
+ /* S  */ {  TRUE,  FALSE, TRUE,  FALSE,  FALSE},
+ /* X  */ {  FALSE, FALSE, FALSE, FALSE,  FALSE},
+ /* AI */ {  TRUE,  TRUE,  FALSE, FALSE,  FALSE}
+};
+
+/* STRONGER-OR-EQUAL RELATION (mode1=row, mode2=column; + = relation holds)
+ *    IS IX S  X  AI
+ * IS +  -  -  -  -
+ * IX +  +  -  -  -
+ * S  +  -  +  -  -
+ * X  +  +  +  +  +
+ * AI -  -  -  -  +
+ * See lock_mode_stronger_or_eq().
+ */
+static const byte lock_strength_matrix[5][5] = {
+ /**         IS     IX     S      X       AI */
+ /* IS */ {  TRUE,  FALSE, FALSE,  FALSE, FALSE},
+ /* IX */ {  TRUE,  TRUE,  FALSE, FALSE,  FALSE},
+ /* S  */ {  TRUE,  FALSE, TRUE,  FALSE,  FALSE},
+ /* X  */ {  TRUE,  TRUE,  TRUE,  TRUE,   TRUE},
+ /* AI */ {  FALSE, FALSE, FALSE, FALSE,  TRUE}
+};
+
+#define PRDT_HEAPNO	PAGE_HEAP_NO_INFIMUM
+/** Outcome of a record locking request */
+enum lock_rec_req_status {
+        /** The lock could not be acquired */
+        LOCK_REC_FAIL,
+        /** Succeeded without creating a lock (implicit or already acquired) */
+        LOCK_REC_SUCCESS,
+        /** Succeeded by explicitly creating a new lock */
+        LOCK_REC_SUCCESS_CREATED
+};
+
+#ifdef UNIV_DEBUG
+/** Number of lock modes: UT_ARR_SIZE of lock_compatibility_matrix. */
+static const ulint      lock_types = UT_ARR_SIZE(lock_compatibility_matrix);
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets the previous record lock set on a record.
+@return previous lock on the same record, NULL if none exists */
+const lock_t*
+lock_rec_get_prev(
+/*==============*/
+	const lock_t*	in_lock,/*!< in: record lock */
+	ulint		heap_no);/*!< in: heap number of the record */
+
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return transaction id of the transaction which has the x-lock, or 0 */
+UNIV_INLINE
+trx_id_t
+lock_clust_rec_some_has_impl(
+/*=========================*/
+	const rec_t*		rec,	/*!< in: user record */
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const rec_offs*		offsets)/*!< in: rec_get_offsets(rec, index) */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_on_page_const(
+/*============================*/
+	const lock_t*	lock);	/*!< in: a record lock */
+
+/*********************************************************************//**
+Gets the nth bit of a record lock.
+@return TRUE if the bit is set; FALSE if not, or if i == ULINT_UNDEFINED */
+UNIV_INLINE
+ibool
+lock_rec_get_nth_bit(
+/*=================*/
+	const lock_t*	lock,	/*!< in: record lock */
+	ulint		i);	/*!< in: index of the bit */
+
+/*********************************************************************//**
+Gets the number of bits in a record lock bitmap.
+@return number of bits */
+UNIV_INLINE
+ulint
+lock_rec_get_n_bits(
+/*================*/
+	const lock_t*	lock);	/*!< in: record lock */
+
+/**********************************************************************//**
+Sets the nth bit of a record lock to TRUE. */
+inline
+void
+lock_rec_set_nth_bit(
+/*=================*/
+	lock_t*	lock,	/*!< in/out: record lock */
+	ulint	i);	/*!< in: index of the bit */
+
+/** Clear the nth bit of a record lock bitmap (stored right after the lock_t),
+decrementing the owning transaction's n_rec_locks if the bit was set.
+@param[in,out] lock record lock
+@param[in] i index of the bit that will be reset
+@return previous state of the bit: nonzero if it was set, 0 otherwise */
+inline byte lock_rec_reset_nth_bit(lock_t* lock, ulint i)
+{
+	ut_ad(!lock->is_table());
+#ifdef SUX_LOCK_GENERIC
+	ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner());
+#else
+	ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner()
+	      || (xtest() && !lock->trx->mutex_is_locked()));
+#endif
+	ut_ad(i < lock->un_member.rec_lock.n_bits);
+
+	byte* const	slot = reinterpret_cast<byte*>(&lock[1]) + (i >> 3);
+	const byte	mask = byte(1U << (i & 7));
+	const byte	old_bit = *slot & mask;
+	*slot &= byte(~mask);
+
+	if (old_bit) {
+		ut_d(auto n=)
+		lock->trx->lock.n_rec_locks--;
+		ut_ad(n);
+	}
+	return old_bit;
+}
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next_on_page(
+/*======================*/
+	lock_t*		lock);		/*!< in: a record lock */
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next(
+/*==============*/
+	ulint	heap_no,/*!< in: heap number of the record */
+	lock_t*	lock);	/*!< in: lock */
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_const(
+/*====================*/
+	ulint		heap_no,/*!< in: heap number of the record */
+	const lock_t*	lock);	/*!< in: lock */
+
+/** Find the first explicit lock request on a record by scanning the
+hash chain of the cell.
+@param cell     first lock hash table cell
+@param id       page identifier
+@param heap_no  record identifier in page
+@return first matching lock, or nullptr if none exists */
+inline lock_t *lock_sys_t::get_first(const hash_cell_t &cell, page_id_t id,
+                                     ulint heap_no)
+{
+  lock_sys.assert_locked(cell);
+  lock_t *lock= static_cast<lock_t*>(cell.node);
+  for (; lock; lock= lock->hash)
+  {
+    ut_ad(!lock->is_table());
+    if (lock->un_member.rec_lock.page_id == id &&
+        lock_rec_get_nth_bit(lock, heap_no))
+      break;
+  }
+  return lock;
+}
+
+/*********************************************************************//**
+Calculates if lock mode 1 is compatible with lock mode 2.
+@return nonzero if mode1 compatible with mode2 */
+UNIV_INLINE
+ulint
+lock_mode_compatible(
+/*=================*/
+	enum lock_mode	mode1,	/*!< in: lock mode */
+	enum lock_mode	mode2);	/*!< in: lock mode */
+
+/*********************************************************************//**
+Calculates if lock mode 1 is stronger or equal to lock mode 2.
+@return nonzero if mode1 stronger or equal to mode2 */
+UNIV_INLINE
+ulint
+lock_mode_stronger_or_eq(
+/*=====================*/
+	enum lock_mode	mode1,	/*!< in: lock mode */
+	enum lock_mode	mode2);	/*!< in: lock mode */
+
+/*********************************************************************//**
+Checks if a transaction has the specified table lock, or stronger. This
+function should only be called by the thread that owns the transaction.
+@return lock or NULL */
+UNIV_INLINE
+const lock_t*
+lock_table_has(
+/*===========*/
+	const trx_t*		trx,	/*!< in: transaction */
+	const dict_table_t*	table,	/*!< in: table */
+	enum lock_mode		mode);	/*!< in: lock mode */
+
+#include "lock0priv.inl"
+
+#endif /* lock0priv_h */
diff --git a/storage/innobase/include/lock0priv.inl b/storage/innobase/include/lock0priv.inl
new file mode 100644
index 00000000..3b4ebcc8
--- /dev/null
+++ b/storage/innobase/include/lock0priv.inl
@@ -0,0 +1,255 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0priv.inl
+Lock module internal inline methods.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+/* This file contains only methods which are used in
+lock/lock0* files, other than lock/lock0lock.cc.
+I.e. lock/lock0lock.cc contains more internal inline
+methods but they are used only in that file. */
+
+#ifndef LOCK_MODULE_IMPLEMENTATION
+#error Do not include lock0priv.ic outside of the lock/ module
+#endif
+
+#include "row0row.h"
+
+/*********************************************************************//**
+Determines which transaction, if any, has an implicit x-lock on a record in
+a clustered index, by reading the transaction id stored in the record.
+@return transaction id of the transaction which has the x-lock, or 0 */
+UNIV_INLINE
+trx_id_t
+lock_clust_rec_some_has_impl(
+/*=========================*/
+	const rec_t*		rec,	/*!< in: user record */
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const rec_offs*		offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+	ut_ad(page_rec_is_user_rec(rec));
+	ut_ad(dict_index_is_clust(index));
+
+	const trx_id_t	trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+	return(trx_id);
+}
+
+/*********************************************************************//**
+Reads the size, in bits, of the bitmap that belongs to a record lock.
+@return	number of bits */
+UNIV_INLINE
+ulint
+lock_rec_get_n_bits(
+/*================*/
+	const lock_t*	lock)	/*!< in: record lock */
+{
+	const lock_rec_t&	rec_lock = lock->un_member.rec_lock;
+
+	return(rec_lock.n_bits);
+}
+
+/**********************************************************************//**
+Sets the nth bit in the bitmap of a record lock and increments the record
+lock counter of the transaction that owns the lock. */
+inline
+void
+lock_rec_set_nth_bit(
+/*=================*/
+	lock_t*	lock,	/*!< in: record lock */
+	ulint	i)	/*!< in: index of the bit */
+{
+	ut_ad(!lock->is_table());
+	ut_ad(i < lock->un_member.rec_lock.n_bits);
+
+	/* The bitmap starts right after the lock struct. */
+	byte* const	bitmap = (byte*) &lock[1];
+	const ulint	byte_pos = i / 8;
+	const ulint	bit_pos = i % 8;
+
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 4 and 5 need this here */
+#endif
+	bitmap[byte_pos] |= static_cast<byte>(1 << bit_pos);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+	/* Debug check: the caller must hold lock_sys exclusively or own the
+	transaction mutex (or, without SUX_LOCK_GENERIC, satisfy the xtest()
+	condition below). */
+#ifdef SUX_LOCK_GENERIC
+	ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner());
+#else
+	ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner()
+	      || (xtest() && !lock->trx->mutex_is_locked()));
+#endif
+	lock->trx->lock.n_rec_locks++;
+}
+
+/*********************************************************************//**
+Gets the next record lock on the same page; non-const wrapper around
+lock_rec_get_next_on_page_const().
+@return	next lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next_on_page(
+/*======================*/
+	lock_t*	lock)	/*!< in: a record lock */
+{
+  const lock_t *next= lock_rec_get_next_on_page_const(lock);
+  return const_cast<lock_t*>(next);
+}
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record: advances through the
+locks on the same page until one has the bit for heap_no set.
+@return	next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next(
+/*==============*/
+	ulint	heap_no,/*!< in: heap number of the record */
+	lock_t*	lock)	/*!< in: lock */
+{
+	for (;;) {
+		lock = lock_rec_get_next_on_page(lock);
+
+		if (lock == NULL || lock_rec_get_nth_bit(lock, heap_no)) {
+			return(lock);
+		}
+	}
+}
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record; const wrapper around
+lock_rec_get_next().
+@return	next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_const(
+/*====================*/
+	ulint		heap_no,/*!< in: heap number of the record */
+	const lock_t*	lock)	/*!< in: lock */
+{
+  lock_t *mutable_lock= const_cast<lock_t*>(lock);
+  return lock_rec_get_next(heap_no, mutable_lock);
+}
+
+/*********************************************************************//**
+Gets the nth bit of a record lock.
+@return TRUE if the bit is set; FALSE if it is not set or if i is not
+smaller than the number of bits in the bitmap (e.g. i == ULINT_UNDEFINED) */
+UNIV_INLINE
+ibool
+lock_rec_get_nth_bit(
+/*=================*/
+	const lock_t*	lock,	/*!< in: record lock */
+	ulint		i)	/*!< in: index of the bit */
+{
+	ut_ad(!lock->is_table());
+
+	if (i < lock->un_member.rec_lock.n_bits) {
+		/* The bitmap starts right after the lock struct. */
+		const byte*	bitmap = (const byte*) &lock[1];
+
+		return((bitmap[i / 8] >> (i % 8)) & 1);
+	}
+
+	return(FALSE);
+}
+
+/*********************************************************************//**
+Gets the next record lock on the same page: follows the hash chain from
+the given lock until a lock with the same page identifier is found.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_on_page_const(
+/*============================*/
+	const lock_t*	lock)	/*!< in: a record lock */
+{
+  ut_ad(!lock->is_table());
+
+  const page_id_t page_id{lock->un_member.rec_lock.page_id};
+
+  for (;;)
+  {
+    lock= static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock));
+    if (!lock || lock->un_member.rec_lock.page_id == page_id)
+      return lock;
+  }
+}
+
+/*********************************************************************//**
+Looks up in lock_compatibility_matrix whether lock mode 1 is compatible
+with lock mode 2.
+@return nonzero if mode1 compatible with mode2 */
+UNIV_INLINE
+ulint
+lock_mode_compatible(
+/*=================*/
+	enum lock_mode	mode1,	/*!< in: lock mode */
+	enum lock_mode	mode2)	/*!< in: lock mode */
+{
+	const ulint	row = (ulint) mode1;
+	const ulint	col = (ulint) mode2;
+
+	ut_ad(row < lock_types);
+	ut_ad(col < lock_types);
+
+	return(lock_compatibility_matrix[row][col]);
+}
+
+/*********************************************************************//**
+Looks up in lock_strength_matrix whether lock mode 1 is stronger than or
+equal to lock mode 2.
+@return nonzero if mode1 stronger or equal to mode2 */
+UNIV_INLINE
+ulint
+lock_mode_stronger_or_eq(
+/*=====================*/
+	enum lock_mode	mode1,	/*!< in: lock mode */
+	enum lock_mode	mode2)	/*!< in: lock mode */
+{
+	const ulint	row = (ulint) mode1;
+	const ulint	col = (ulint) mode2;
+
+	ut_ad(row < lock_types);
+	ut_ad(col < lock_types);
+
+	return(lock_strength_matrix[row][col]);
+}
+
+/*********************************************************************//**
+Checks if a transaction has the specified table lock, or stronger. This
+function should only be called by the thread that owns the transaction.
+@return lock or NULL */
+UNIV_INLINE
+const lock_t*
+lock_table_has(
+/*===========*/
+	const trx_t*		trx,	/*!< in: transaction */
+	const dict_table_t*	table,	/*!< in: table */
+	lock_mode		in_mode)/*!< in: lock mode */
+{
+	/* Scan the table locks of the transaction for one on this table
+	that is at least as strong as the requested mode. */
+
+	for (const lock_t* lock : trx->lock.table_locks) {
+
+		/* The list may contain NULL entries; skip them. */
+		if (!lock) {
+			continue;
+		}
+
+		ut_ad(trx == lock->trx);
+		ut_ad(lock->is_table());
+		ut_ad(lock->un_member.tab_lock.table);
+
+		if (table != lock->un_member.tab_lock.table) {
+			continue;
+		}
+
+		if (!lock_mode_stronger_or_eq(lock->mode(), in_mode)) {
+			continue;
+		}
+
+		ut_ad(!lock->is_waiting());
+		return(lock);
+	}
+
+	return(NULL);
+}
diff --git a/storage/innobase/include/lock0types.h b/storage/innobase/include/lock0types.h
new file mode 100644
index 00000000..0d00b4b3
--- /dev/null
+++ b/storage/innobase/include/lock0types.h
@@ -0,0 +1,251 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0types.h
+The transaction lock system global types
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0types.h"
+#include "buf0types.h"
+#include "ut0lst.h"
+
+#ifndef lock0types_h
+#define lock0types_h
+
+#define lock_t ib_lock_t
+
+struct lock_t;
+struct lock_table_t;
+
+/* Basic lock modes. The mode of a lock is kept in the low bits of its
+type_mode field and is extracted with LOCK_MODE_MASK. */
+enum lock_mode {
+	LOCK_IS = 0,	/* intention shared */
+	LOCK_IX,	/* intention exclusive */
+	LOCK_S,		/* shared */
+	LOCK_X,		/* exclusive */
+	LOCK_AUTO_INC,	/* locks the auto-inc counter of a table
+			in an exclusive mode */
+	LOCK_NONE,	/* this is used elsewhere to note consistent read */
+	LOCK_NUM = LOCK_NONE, /* number of lock modes */
+	/* NOTE(review): the purpose of this value is not evident from this
+	header; 7 is also the value of LOCK_MODE_MASK - confirm the intent
+	against the users of LOCK_NONE_UNSET. */
+	LOCK_NONE_UNSET = 7
+};
+
+/** A table lock; stored in the un_member union of a lock struct */
+struct lock_table_t {
+	dict_table_t*	table;		/*!< database table in dictionary
+					cache */
+	UT_LIST_NODE_T(ib_lock_t)
+			locks;		/*!< list of locks on the same
+					table */
+	/** Print the table lock into the given output stream
+	@param[in,out]	out	the output stream
+	@return the given output stream. */
+	std::ostream& print(std::ostream& out) const;
+};
+
+/** Record lock for a page; stored in the un_member union of a lock struct.
+It identifies the page and gives the size of the lock bitmap. */
+struct lock_rec_t {
+	/** page identifier */
+	page_id_t	page_id;
+	ib_uint32_t	n_bits;		/*!< number of bits in the lock
+					bitmap; NOTE: the lock bitmap is
+					placed immediately after the
+					lock struct */
+
+	/** Print the record lock into the given output stream
+	@param[in,out]	out	the output stream
+	@return the given output stream. */
+	std::ostream& print(std::ostream& out) const;
+};
+
+/** Write the tablespace id, page number and bitmap size of the record lock
+to an output stream.
+@param[in,out]	out	the output stream
+@return the given output stream. */
+inline std::ostream &lock_rec_t::print(std::ostream &out) const
+{
+  return out << "[lock_rec_t: space=" << page_id.space()
+             << ", page_no=" << page_id.page_no()
+             << ", n_bits=" << n_bits << "]";
+}
+
+/** Stream insertion for a record lock; forwards to lock_rec_t::print().
+@param[in,out]	out	the output stream
+@param[in]	lock	the record lock to print
+@return the given output stream. */
+inline std::ostream &operator<<(std::ostream &out, const lock_rec_t &lock)
+{
+  std::ostream &stream= lock.print(out);
+  return stream;
+}
+
+#define LOCK_MODE_MASK	0x7	/*!< mask used to extract mode from the
+				type_mode field in a lock */
+/** Lock types */
+/* @{ */
+/** table lock (record lock if the flag is not set) */
+#define LOCK_TABLE	8U
+
+#define LOCK_WAIT	256U	/*!< Waiting lock flag; when set, it
+				means that the lock has not yet been
+				granted, it is just waiting for its
+				turn in the wait queue */
+/* Precise modes */
+#define LOCK_ORDINARY	0	/*!< this flag denotes an ordinary
+				next-key lock in contrast to LOCK_GAP
+				or LOCK_REC_NOT_GAP */
+#define LOCK_GAP	512U	/*!< when this bit is set, it means that the
+				lock holds only on the gap before the record;
+				for instance, an x-lock on the gap does not
+				give permission to modify the record on which
+				the bit is set; locks of this type are created
+				when records are removed from the index chain
+				of records */
+#define LOCK_REC_NOT_GAP 1024U	/*!< this bit means that the lock is only on
+				the index record and does NOT block inserts
+				to the gap before the index record; this is
+				used in the case when we retrieve a record
+				with a unique key, and is also used in
+				locking plain SELECTs (not part of UPDATE
+				or DELETE) when the user has set the READ
+				COMMITTED isolation level */
+#define LOCK_INSERT_INTENTION 2048U/*!< this bit is set when we place a waiting
+				gap type record lock request in order to let
+				an insert of an index record to wait until
+				there are no conflicting locks by other
+				transactions on the gap; note that this flag
+				remains set when the waiting lock is granted,
+				or if the lock is inherited to a neighboring
+				record */
+#define LOCK_PREDICATE	8192U	/*!< Predicate lock */
+#define LOCK_PRDT_PAGE	16384U	/*!< Page lock */
+
+
+#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION|LOCK_PREDICATE|LOCK_PRDT_PAGE)&LOCK_MODE_MASK
+# error
+#endif
+#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION|LOCK_PREDICATE|LOCK_PRDT_PAGE)&LOCK_TYPE_MASK
+# error
+#endif
+/* @} */
+
+/**
+Checks if the `mode` of a record lock is LOCK_S or LOCK_X (possibly ORed
+with LOCK_WAIT), which means the lock is a Next Key Lock, a.k.a.
+LOCK_ORDINARY, as opposed to Predicate Lock, GAP lock, Insert Intention or
+Record Lock. LOCK_TABLE must not be set in `mode`.
+@param  mode  A mode and flags, of a lock.
+@return true if, apart from LOCK_WAIT, no bit outside LOCK_MODE_MASK is set
+in `mode` */
+static inline bool lock_mode_is_next_key_lock(ulint mode)
+{
+  static_assert(LOCK_ORDINARY == 0, "LOCK_ORDINARY must be 0 (no flags)");
+  ut_ad((mode & LOCK_TABLE) == 0);
+  /* The wait flag does not affect the kind of the lock; ignore it. */
+  mode&= ~LOCK_WAIT;
+  ut_ad((mode & LOCK_WAIT) == 0);
+  const bool no_precise_flags= (mode & ~(LOCK_MODE_MASK)) == LOCK_ORDINARY;
+  /* Debug check: with no flag bits left, the mode must be LOCK_S or
+  LOCK_X, and vice versa. */
+  ut_ad(no_precise_flags == (mode == LOCK_S || mode == LOCK_X));
+  return no_precise_flags;
+}
+
+/** Lock struct; protected by lock_sys.latch.
+Describes either a table lock or a record lock; is_table() tells which. */
+struct ib_lock_t
+{
+  /** the owner of the lock */
+  trx_t *trx;
+  /** other locks of the transaction; protected by
+  lock_sys.is_writer() and trx->mutex_is_owner(); @see trx_lock_t::trx_locks */
+  UT_LIST_NODE_T(ib_lock_t) trx_locks;
+
+	dict_index_t*	index;		/*!< index for a record lock */
+
+	ib_lock_t*	hash;		/*!< hash chain node for a record
+					lock. The link node in a singly linked
+					list, used during hashing. */
+
+	/** time(NULL) of the lock request creation.
+	Used for computing wait_time and diagnostics only.
+	Note: bogus durations may be reported
+	when the system time is adjusted! */
+	time_t		requested_time;
+	/** Cumulated wait time in seconds.
+	Note: may be bogus when the system time is adjusted! */
+	ulint		wait_time;
+
+	/* tab_lock describes a table lock and rec_lock a record lock */
+	union {
+		lock_table_t	tab_lock;/*!< table lock */
+		lock_rec_t	rec_lock;/*!< record lock */
+	} un_member;			/*!< lock details */
+
+	ib_uint32_t	type_mode;	/*!< lock type, mode, LOCK_GAP or
+					LOCK_REC_NOT_GAP,
+					LOCK_INSERT_INTENTION,
+					wait flag, ORed */
+
+	/** @return whether LOCK_WAIT is set, that is, the lock has not
+	yet been granted */
+	bool is_waiting() const
+	{
+		return(type_mode & LOCK_WAIT);
+	}
+
+	/** @return whether LOCK_GAP is set */
+	bool is_gap() const
+	{
+		return(type_mode & LOCK_GAP);
+	}
+
+	/** @return whether LOCK_REC_NOT_GAP is set */
+	bool is_record_not_gap() const
+	{
+		return(type_mode & LOCK_REC_NOT_GAP);
+	}
+
+	/** @return true if the lock is a Next Key Lock */
+	bool is_next_key_lock() const
+	{
+		/* LOCK_TABLE is tested first, because
+		lock_mode_is_next_key_lock() asserts that it is not set */
+		return !(type_mode & LOCK_TABLE) &&
+		       lock_mode_is_next_key_lock(type_mode);
+	}
+
+	/** @return whether LOCK_INSERT_INTENTION is set */
+	bool is_insert_intention() const
+	{
+		return(type_mode & LOCK_INSERT_INTENTION);
+	}
+
+	/** @return whether this is a table lock (LOCK_TABLE is set);
+	otherwise, this is a record lock */
+	bool is_table() const { return type_mode & LOCK_TABLE; }
+
+	/** @return the basic lock mode, extracted with LOCK_MODE_MASK */
+	enum lock_mode mode() const
+	{
+		return(static_cast<enum lock_mode>(type_mode & LOCK_MODE_MASK));
+	}
+
+        /** @return whether the basic mode is LOCK_X and LOCK_GAP is not set.
+        NOTE(review): neither LOCK_WAIT nor LOCK_TABLE is examined here,
+        despite the name; presumably callers only pass granted record
+        locks - confirm against the callers. */
+        bool is_rec_granted_exclusive_not_gap() const
+        {
+          return (type_mode & (LOCK_MODE_MASK | LOCK_GAP)) == LOCK_X;
+        }
+
+	/** Print the lock object into the given output stream.
+	@param[in,out]	out	the output stream
+	@return the given output stream. */
+	std::ostream& print(std::ostream& out) const;
+
+	/** @return "LOCK_TABLE" for a table lock, "LOCK_REC" otherwise */
+	const char* type_string() const
+	{ return is_table() ? "LOCK_TABLE" : "LOCK_REC"; }
+};
+
+typedef UT_LIST_BASE_NODE_T(ib_lock_t) trx_lock_list_t;
+
+#endif /* lock0types_h */
diff --git a/storage/innobase/include/log0crypt.h b/storage/innobase/include/log0crypt.h
new file mode 100644
index 00000000..22c0c963
--- /dev/null
+++ b/storage/innobase/include/log0crypt.h
@@ -0,0 +1,115 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (C) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file include/log0crypt.h
+Innodb log encrypt/decrypt
+
+Created 11/25/2013 Minli Zhu
+Modified           Jan Lindström jan.lindstrom@mariadb.com
+MDEV-11782: Rewritten for MariaDB 10.2 by Marko Mäkelä, MariaDB Corporation.
+*******************************************************/
+#pragma once
+
+#include "log0log.h"
+
+/** Initialize the redo log encryption key and random parameters
+when creating a new redo log.
+The random parameters will be persisted in the log header.
+@see log_crypt_write_header()
+@see log_crypt_read_header()
+@return whether the operation succeeded */
+bool log_crypt_init();
+
+/** Add the encryption information to the log header buffer.
+@param buf   part of log header buffer */
+void log_crypt_write_header(byte *buf);
+
+/** Read the encryption information from a redo log checkpoint buffer.
+@param buf   part of checkpoint buffer
+@return whether the operation was successful */
+bool log_crypt_read_header(const byte *buf);
+
+/** Read the MariaDB 10.1 checkpoint crypto (version, msg and iv) info.
+@param[in]	buf	checkpoint buffer
+@return	whether the operation was successful */
+ATTRIBUTE_COLD bool log_crypt_101_read_checkpoint(const byte* buf);
+
+/** Decrypt a MariaDB 10.1 redo log block.
+@param[in,out]	buf		log block
+@param[in]	start_lsn	server start LSN
+@return	whether the decryption was successful */
+ATTRIBUTE_COLD bool log_crypt_101_read_block(byte* buf, lsn_t start_lsn);
+
+/** Read the checkpoint crypto (version, msg and iv) info.
+@param[in]	buf	checkpoint buffer
+@return	whether the operation was successful */
+ATTRIBUTE_COLD bool log_crypt_read_checkpoint_buf(const byte* buf);
+
+/** Decrypt log blocks.
+@param[in,out]	buf	log blocks to decrypt
+@param[in]	lsn	log sequence number of the start of the buffer
+@param[in]	size	size of the buffer, in bytes
+@return	whether the operation succeeded */
+ATTRIBUTE_COLD bool log_decrypt(byte* buf, lsn_t lsn, ulint size);
+
+/** Decrypt part of a log record.
+@param iv    initialization vector
+@param buf   buffer for the decrypted data
+@param data  the encrypted data
+@param len   length of the data, in bytes
+@return buf */
+byte *log_decrypt_buf(const byte *iv, byte *buf, const byte *data, uint len);
+
+/** Decrypt a log snippet.
+@param iv    initialization vector
+@param buf   buffer to be replaced with decrypted contents
+@param end   pointer past the end of buf */
+void log_decrypt_buf(const byte *iv, byte *buf, const byte *const end);
+
+/** Encrypt or decrypt a temporary file block.
+@param[in]	src		block to encrypt or decrypt
+@param[in]	size		size of the block
+@param[out]	dst		destination block
+@param[in]	offs		offset to block
+@param[in]	encrypt		true=encrypt; false=decrypt
+@return whether the operation succeeded */
+bool log_tmp_block_encrypt(
+	const byte*	src,
+	ulint		size,
+	byte*		dst,
+	uint64_t	offs,
+	bool		encrypt = true)
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+/** Decrypt a temporary file block by invoking log_tmp_block_encrypt()
+in decryption mode.
+@param[in]	src		block to decrypt
+@param[in]	size		size of the block
+@param[out]	dst		destination block
+@param[in]	offs		offset to block
+@return whether the operation succeeded */
+inline
+bool
+log_tmp_block_decrypt(
+	const byte*	src,
+	ulint		size,
+	byte*		dst,
+	uint64_t	offs)
+{
+	const bool	encrypt = false;
+
+	return(log_tmp_block_encrypt(src, size, dst, offs, encrypt));
+}
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
new file mode 100644
index 00000000..f873eabf
--- /dev/null
+++ b/storage/innobase/include/log0log.h
@@ -0,0 +1,529 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0log.h
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "log0types.h"
+#include "os0file.h"
+#include "span.h"
+#include "my_atomic_wrapper.h"
+#include "srw_lock.h"
+#include <string>
+
+using st_::span;
+
+static const char LOG_FILE_NAME_PREFIX[] = "ib_logfile";
+static const char LOG_FILE_NAME[] = "ib_logfile0";
+
+/** Composes full path for a redo log file
+@param[in]	filename	name of the redo log file
+@return path with log file name*/
+std::string get_log_file_path(const char *filename= LOG_FILE_NAME);
+
+/** Delete a redo log file, if it exists.
+@param[in]	suffix	suffix of the file name, appended to the path
+			composed for LOG_FILE_NAME_PREFIX */
+static inline void delete_log_file(const char* suffix)
+{
+  std::string path= get_log_file_path(LOG_FILE_NAME_PREFIX);
+  path.append(suffix);
+  os_file_delete_if_exists_func(path.c_str(), nullptr);
+}
+
+struct completion_callback;
+
+/** Ensure that the log has been written to the log file up to a given
+log entry (such as that of a transaction commit). Start a new write, or
+wait and check if an already running write is covering the request.
+@param lsn      log sequence number that should be included in the file write
+@param durable  whether the write needs to be durable
+@param callback log write completion callback */
+void log_write_up_to(lsn_t lsn, bool durable,
+                     const completion_callback *callback= nullptr);
+
+/** Write to the log file up to the last log entry.
+@param durable  whether to wait for a durable write to complete */
+void log_buffer_flush_to_disk(bool durable= true);
+
+
+/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. */
+ATTRIBUTE_COLD void log_write_and_flush_prepare();
+
+/** Durably write the log up to log_sys.get_lsn(). */
+ATTRIBUTE_COLD void log_write_and_flush();
+
+/** Make a checkpoint */
+ATTRIBUTE_COLD void log_make_checkpoint();
+
+/** Make a checkpoint at the latest lsn on shutdown. */
+ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown();
+
+/**
+Checks that there is enough free space in the log to start a new query step.
+Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
+function may only be called if the calling thread owns no synchronization
+objects! */
+ATTRIBUTE_COLD void log_check_margins();
+
+/******************************************************//**
+Prints info of the log. */
+void
+log_print(
+/*======*/
+	FILE*	file);	/*!< in: file where to print */
+
+/** Offsets of a log file header */
+/* @{ */
+/** Log file header format identifier (32-bit unsigned big-endian integer).
+This used to be called LOG_GROUP_ID and always written as 0,
+because InnoDB never supported more than one copy of the redo log. */
+#define LOG_HEADER_FORMAT	0
+/** LSN of the start of data in this log file (with format version 1;
+in format version 0, it was called LOG_FILE_START_LSN and at offset 4). */
+#define LOG_HEADER_START_LSN	8
+/** A null-terminated string which will contain either the string 'ibbackup'
+and the creation time if the log file was created by mysqlbackup --restore,
+or the MySQL version that created the redo log file. */
+#define LOG_HEADER_CREATOR	16
+/** End of the log file creator field. */
+#define LOG_HEADER_CREATOR_END	48
+/* @} */
+
+struct log_t;
+
+/** File abstraction: a thin wrapper around an os_file_t handle.
+log_t is a friend and may access the handle directly. */
+class log_file_t
+{
+  friend log_t;
+  /** the file handle; OS_FILE_CLOSED when no file is open */
+  os_file_t m_file{OS_FILE_CLOSED};
+public:
+  /** Construct an object with no open file */
+  log_file_t()= default;
+  /** Wrap an existing file handle
+  @param file   file handle */
+  log_file_t(os_file_t file) noexcept : m_file(file) {}
+
+  /** Open a file
+  @return file size in bytes
+  @retval 0 if not readable */
+  os_offset_t open(bool read_only) noexcept;
+  /** @return whether a file handle other than OS_FILE_CLOSED is held */
+  bool is_opened() const noexcept { return m_file != OS_FILE_CLOSED; }
+
+  /* The following three are only declared here; NOTE(review): their exact
+  error handling is not visible in this header - see the definitions. */
+  dberr_t close() noexcept;
+  dberr_t read(os_offset_t offset, span<byte> buf) noexcept;
+  void write(os_offset_t offset, span<const byte> buf) noexcept;
+  /** @return the result of os_file_flush() on the file handle */
+  bool flush() const noexcept { return os_file_flush(m_file); }
+#ifdef HAVE_PMEM
+  /* Only available when built with HAVE_PMEM */
+  byte *mmap(bool read_only, const struct stat &st) noexcept;
+#endif
+};
+
+/** Redo log buffer */
+struct log_t
+{
+  /** The original (not version-tagged) InnoDB redo log format */
+  static constexpr uint32_t FORMAT_3_23= 0;
+  /** The MySQL 5.7.9/MariaDB 10.2.2 log format */
+  static constexpr uint32_t FORMAT_10_2= 1;
+  /** The MariaDB 10.3.2 log format. */
+  static constexpr uint32_t FORMAT_10_3= 103;
+  /** The MariaDB 10.4.0 log format. */
+  static constexpr uint32_t FORMAT_10_4= 104;
+  /** Encrypted MariaDB redo log */
+  static constexpr uint32_t FORMAT_ENCRYPTED= 1U << 31;
+  /** The MariaDB 10.4.0 log format (only with innodb_encrypt_log=ON) */
+  static constexpr uint32_t FORMAT_ENC_10_4= FORMAT_10_4 | FORMAT_ENCRYPTED;
+  /** The MariaDB 10.5.1 physical redo log format */
+  static constexpr uint32_t FORMAT_10_5= 0x50485953;
+  /** The MariaDB 10.5.1 physical format (only with innodb_encrypt_log=ON) */
+  static constexpr uint32_t FORMAT_ENC_10_5= FORMAT_10_5 | FORMAT_ENCRYPTED;
+  /** The MariaDB 10.8.0 variable-block-size redo log format */
+  static constexpr uint32_t FORMAT_10_8= 0x50687973;
+  /** The MariaDB 10.8.0 format with innodb_encrypt_log=ON */
+  static constexpr uint32_t FORMAT_ENC_10_8= FORMAT_10_8 | FORMAT_ENCRYPTED;
+
+  /** Location of the first checkpoint block */
+  static constexpr size_t CHECKPOINT_1= 4096;
+  /** Location of the second checkpoint block */
+  static constexpr size_t CHECKPOINT_2= 8192;
+  /** Start of record payload */
+  static constexpr lsn_t START_OFFSET= 12288;
+
+  /** smallest possible log sequence number in the current format
+  (used to be 2048 before FORMAT_10_8). */
+  static constexpr lsn_t FIRST_LSN= START_OFFSET;
+
+private:
+  /** The log sequence number of the last change of durable InnoDB files */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+  std::atomic<lsn_t> lsn;
+  /** the first guaranteed-durable log sequence number */
+  std::atomic<lsn_t> flushed_to_disk_lsn;
+  /** log sequence number when log resizing was initiated, or 0 */
+  std::atomic<lsn_t> resize_lsn;
+  /** set when there may be need to flush the log buffer, or
+  preflush buffer pool pages, or initiate a log checkpoint.
+  This must hold if lsn - last_checkpoint_lsn > max_checkpoint_age. */
+  std::atomic<bool> check_flush_or_checkpoint_;
+
+
+#if defined(__aarch64__)
+/* On ARM, we do more spinning */
+typedef srw_spin_lock log_rwlock_t;
+#define LSN_LOCK_ATTR MY_MUTEX_INIT_FAST
+#else
+typedef srw_lock log_rwlock_t;
+#define LSN_LOCK_ATTR nullptr
+#endif
+
+public:
+  /** rw-lock protecting buf */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) log_rwlock_t latch;
+private:
+  /** Last written LSN */
+  lsn_t write_lsn;
+public:
+  /** log record buffer, written to by mtr_t::commit() */
+  byte *buf;
+  /** buffer for writing data to ib_logfile0, or nullptr if is_pmem()
+  In write_buf(), buf and flush_buf are swapped */
+  byte *flush_buf;
+  /** number of std::swap(buf, flush_buf) and writes from buf to log;
+  protected by latch.wr_lock() */
+  ulint write_to_log;
+
+  /** Log sequence number when a log file overwrite (broken crash recovery)
+  was noticed. Protected by latch.wr_lock(). */
+  lsn_t overwrite_warned;
+
+  /** innodb_log_buffer_size (size of buf,flush_buf if !is_pmem(), in bytes) */
+  size_t buf_size;
+
+private:
+  /** Log file being constructed during resizing; protected by latch */
+  log_file_t resize_log;
+  /** size of resize_log; protected by latch */
+  lsn_t resize_target;
+  /** Buffer for writing to resize_log; @see buf */
+  byte *resize_buf;
+  /** Buffer for writing to resize_log; @see flush_buf */
+  byte *resize_flush_buf;
+
+  /** spin lock protecting lsn, buf_free in append_prepare() */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) pthread_mutex_t lsn_lock;
+  void init_lsn_lock() { pthread_mutex_init(&lsn_lock, LSN_LOCK_ATTR); }
+  void lock_lsn() { pthread_mutex_lock(&lsn_lock); }
+  void unlock_lsn() { pthread_mutex_unlock(&lsn_lock); }
+  void destroy_lsn_lock() { pthread_mutex_destroy(&lsn_lock); }
+
+public:
+  /** first free offset within buf use; protected by lsn_lock */
+  Atomic_relaxed<size_t> buf_free;
+  /** number of write requests (to buf); protected by exclusive lsn_lock */
+  ulint write_to_buf;
+  /** number of waits in append_prepare(); protected by lsn_lock */
+  ulint waits;
+  /** recommended maximum size of buf, after which the buffer is flushed */
+  size_t max_buf_free;
+
+  /** log file size in bytes, including the header */
+  lsn_t file_size;
+private:
+  /** the log sequence number at the start of the log file */
+  lsn_t first_lsn;
+#if defined __linux__ || defined _WIN32
+  /** The physical block size of the storage; @see get_block_size() */
+  uint32_t block_size;
+#endif /* __linux__ || _WIN32 */
+public:
+  /** format of the redo log: e.g., FORMAT_10_8 */
+  uint32_t format;
+  /** Log file */
+  log_file_t log;
+#if defined __linux__ || defined _WIN32
+  /** whether file system caching is enabled for the log */
+  my_bool log_buffered;
+  /** whether file system caching may be disabled (constant true on _WIN32) */
+# ifdef _WIN32
+  static constexpr bool log_maybe_unbuffered= true;
+# else /* !_WIN32 */
+  bool log_maybe_unbuffered;
+# endif /* _WIN32 */
+#endif /* __linux__ || _WIN32 */
+
+	/** Fields involved in checkpoints @{ */
+	lsn_t		log_capacity;	/*!< capacity of the log; if
+					the checkpoint age exceeds this, it is
+					a serious error because it is possible
+					we will then overwrite log and spoil
+					crash recovery */
+	lsn_t		max_modified_age_async;
+					/*!< when this recommended
+					value for lsn -
+					buf_pool.get_oldest_modification()
+					is exceeded, we start an
+					asynchronous preflush of pool pages */
+	lsn_t		max_checkpoint_age;
+					/*!< this is the maximum allowed value
+					for lsn - last_checkpoint_lsn when a
+					new query step is started */
+  /** latest completed checkpoint (protected by latch.wr_lock()) */
+  Atomic_relaxed<lsn_t> last_checkpoint_lsn;
+  /** next checkpoint LSN (protected by log_sys.latch) */
+  lsn_t next_checkpoint_lsn;
+  /** next checkpoint number (protected by latch.wr_lock()) */
+  ulint next_checkpoint_no;
+  /** whether a checkpoint is pending */
+  Atomic_relaxed<bool> checkpoint_pending;
+
+  /** buffer for checkpoint header */
+  byte *checkpoint_buf;
+	/* @} */
+
+  /** @return whether max_buf_free has been set to a nonzero value */
+  bool is_initialised() const noexcept { return max_buf_free != 0; }
+#ifdef HAVE_PMEM
+  bool is_pmem() const noexcept { return !flush_buf; }
+#else /* !HAVE_PMEM */
+  static constexpr bool is_pmem() { return false; }
+#endif /* HAVE_PMEM */
+  /** @return whether the log file is open, as reported by log.is_opened() */
+  bool is_opened() const noexcept { return log.is_opened(); }
+
+  /** @return LSN at which log resizing was started and is still in progress
+      @retval 0 if no log resizing is in progress */
+  lsn_t resize_in_progress() const noexcept
+  { return resize_lsn.load(std::memory_order_relaxed); }
+
+  /** Status of resize_start() */
+  enum resize_start_status {
+    RESIZE_NO_CHANGE, RESIZE_IN_PROGRESS, RESIZE_STARTED, RESIZE_FAILED
+  };
+
+  /** Start resizing the log and release the exclusive latch.
+  @param size  requested new file_size
+  @return the outcome of the request, as a resize_start_status value */
+  resize_start_status resize_start(os_offset_t size) noexcept;
+
+  /** Abort any resize_start(). */
+  void resize_abort() noexcept;
+
+  /** Replicate a write to the log.
+  @param lsn  start LSN
+  @param end  end of the mini-transaction
+  @param len  length of the mini-transaction
+  @param seq  offset of the sequence bit from the end */
+  inline void resize_write(lsn_t lsn, const byte *end,
+                           size_t len, size_t seq) noexcept;
+
+  /** Write resize_buf to resize_log.
+  @param length  the used length of resize_buf */
+  ATTRIBUTE_COLD void resize_write_buf(size_t length) noexcept;
+
+  /** Rename a log file after resizing.
+  @return whether an error occurred */
+  static bool resize_rename() noexcept;
+
+#ifdef HAVE_PMEM
+  /** @return pointer for writing to resize_buf
+  @retval nullptr if no PMEM based resizing is active */
+  inline byte *resize_buf_begin(lsn_t lsn) const noexcept;
+  /** @return end of resize_buf (resize_buf + resize_target) */
+  inline const byte *resize_buf_end() const noexcept
+  { return resize_buf + resize_target; }
+
+  /** Initialise the redo log subsystem. */
+  void create_low();
+  /** Initialise the redo log subsystem by calling create_low().
+  @return whether the initialisation succeeded (always true here) */
+  bool create() { create_low(); return true; }
+
+  /** Attach a log file.
+  @return whether the memory allocation succeeded */
+  bool attach(log_file_t file, os_offset_t size);
+#else /* !HAVE_PMEM */
+  /** Initialise the redo log subsystem.
+  @return whether the initialisation succeeded */
+  bool create();
+  /** Attach a log file; attach() wraps this and always returns true. */
+  void attach_low(log_file_t file, os_offset_t size);
+  bool attach(log_file_t file, os_offset_t size)
+  { attach_low(file, size); return true; }
+#endif /* HAVE_PMEM */
+
+#if defined __linux__ || defined _WIN32
+  /** Try to enable or disable file system caching (update log_buffered) */
+  void set_buffered(bool buffered);
+#endif /* __linux__ || _WIN32 */
+  /** Close the log file. NOTE(review): inferred from the name; confirm. */
+  void close_file();
+
+  /** Calculate the checkpoint safety margins. */
+  static void set_capacity();
+
+  /** Write a log file header.
+  @param buf        log header buffer
+  @param lsn        log sequence number corresponding to log_sys.START_OFFSET
+  @param encrypted  whether the log is encrypted */
+  static void header_write(byte *buf, lsn_t lsn, bool encrypted);
+  /** @return the current lsn, loaded with the requested memory order */
+  lsn_t get_lsn(std::memory_order order= std::memory_order_relaxed) const
+  { return lsn.load(order); }
+  /** @return flushed_to_disk_lsn, loaded with the requested memory order */
+  lsn_t get_flushed_lsn(std::memory_order order= std::memory_order_acquire)
+    const noexcept
+  { return flushed_to_disk_lsn.load(order); }
+
+  /** Initialize write_lsn and flushed_to_disk_lsn at log file creation. */
+  lsn_t init_lsn() noexcept
+  {
+    latch.wr_lock(SRW_LOCK_CALL);
+    const lsn_t current_lsn= get_lsn();
+    flushed_to_disk_lsn.store(current_lsn, std::memory_order_relaxed);
+    write_lsn= current_lsn;
+    latch.wr_unlock();
+    return current_lsn;
+  }
+  /** Set write_lsn, lsn and flushed_to_disk_lsn to a recovered LSN. */
+  void set_recovered_lsn(lsn_t lsn) noexcept
+  {
+#ifndef SUX_LOCK_GENERIC
+    ut_ad(latch.is_write_locked());
+#endif /* SUX_LOCK_GENERIC */
+    write_lsn= lsn;
+    this->lsn.store(lsn, std::memory_order_relaxed);
+    flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed);
+  }
+
+#ifdef HAVE_PMEM
+  /** Persist the log.
+  @param lsn    desired new value of flushed_to_disk_lsn */
+  inline void persist(lsn_t lsn) noexcept;
+#endif /* HAVE_PMEM */
+  /** @return the check_flush_or_checkpoint_ flag (relaxed load) */
+  bool check_flush_or_checkpoint() const
+  {
+    return UNIV_UNLIKELY
+      (check_flush_or_checkpoint_.load(std::memory_order_relaxed));
+  }
+  void set_check_flush_or_checkpoint(bool flag= true)
+  { check_flush_or_checkpoint_.store(flag, std::memory_order_relaxed); }
+
+  /** Make previous write_buf() durable and update flushed_to_disk_lsn. */
+  bool flush(lsn_t lsn) noexcept;
+
+  /** Shut down the redo log subsystem. */
+  void close();
+
+#if defined __linux__ || defined _WIN32
+  /** @return the physical block size of the storage */
+  size_t get_block_size() const noexcept
+  { ut_ad(block_size); return block_size; }
+  /** Set the log block size for file I/O. */
+  void set_block_size(uint32_t size) noexcept { block_size= size; }
+#else /* !(__linux__ || _WIN32) */
+  /** @return the block size of the storage (a fixed 512 on this platform) */
+  static size_t get_block_size() { return 512; }
+#endif /* __linux__ || _WIN32 */
+
+private:
+  /** Wait in append_prepare() for buffer to become available
+  @param ex   whether log_sys.latch is exclusively locked */
+  ATTRIBUTE_COLD static void append_prepare_wait(bool ex) noexcept;
+public:
+  /** Reserve space in the log buffer for appending data.
+  @tparam pmem  log_sys.is_pmem()
+  @param size   total length of the data to append(), in bytes
+  @param ex     whether log_sys.latch is exclusively locked
+  @return the start LSN and the buffer position for append() */
+  template<bool pmem>
+  inline std::pair<lsn_t,byte*> append_prepare(size_t size, bool ex) noexcept;
+
+  /** Append a string of bytes to the redo log.
+  @param d     destination; advanced past the copied bytes on return
+  @param s     string of bytes
+  @param size  length of s, in bytes */
+  void append(byte *&d, const void *s, size_t size) noexcept
+  {
+#ifndef SUX_LOCK_GENERIC
+    ut_ad(latch.is_locked());
+#endif /* SUX_LOCK_GENERIC */
+    ut_ad(d + size <= buf + (is_pmem() ? file_size : buf_size));
+    memcpy(d, s, size);
+    d+= size;
+  }
+
+  /** Set the log file format to FORMAT_10_8 or its encrypted variant. */
+  void set_latest_format(bool encrypted) noexcept
+  { format= encrypted ? FORMAT_ENC_10_8 : FORMAT_10_8; }
+  /** @return whether the redo log is encrypted */
+  bool is_encrypted() const noexcept { return format & FORMAT_ENCRYPTED; }
+  /** @return whether the format is FORMAT_10_8, ignoring FORMAT_ENCRYPTED */
+  bool is_latest() const noexcept
+  { return (~FORMAT_ENCRYPTED & format) == FORMAT_10_8; }
+
+  /** @return capacity in bytes (file_size minus START_OFFSET) */
+  lsn_t capacity() const noexcept { return file_size - START_OFFSET; }
+
+  /** Set first_lsn and write_lsn to the LSN of the log file at creation. */
+  void set_first_lsn(lsn_t lsn) noexcept { write_lsn= first_lsn= lsn; }
+  /** @return the first LSN of the log file */
+  lsn_t get_first_lsn() const noexcept { return first_lsn; }
+
+  /** @return 1 if lsn is in an even-numbered pass over the file, else 0 */
+  byte get_sequence_bit(lsn_t lsn) const noexcept
+  {
+    ut_ad(lsn >= first_lsn);
+    return ((lsn - first_lsn) / capacity()) % 2 == 0;
+  }
+
+  /** @param lsn   log sequence number
+  @return byte offset of lsn within ib_logfile0 */
+  lsn_t calc_lsn_offset(lsn_t lsn) const noexcept
+  {
+    ut_ad(lsn >= first_lsn);
+    const lsn_t since_start= lsn - first_lsn;
+    return START_OFFSET + since_start % capacity();
+  }
+
+  /** Write checkpoint information and invoke latch.wr_unlock().
+  @param end_lsn    start LSN of the FILE_CHECKPOINT mini-transaction */
+  inline void write_checkpoint(lsn_t end_lsn) noexcept;
+
+  /** Write buf to ib_logfile0.
+  @tparam release_latch whether to invoke latch.wr_unlock()
+  @return the current log sequence number */
+  template<bool release_latch> inline lsn_t write_buf() noexcept;
+
+  /** Create the log (the overload of create() that takes an LSN). */
+  void create(lsn_t lsn) noexcept;
+};
+
+/** The redo log system object (declared here, defined elsewhere) */
+extern log_t	log_sys;
+
+/** Wait for a log checkpoint if needed.
+NOTE: this function may only be called while not holding
+any synchronization objects except dict_sys.latch. */
+void log_free_check();
+
+/** Release the latches that protect log resizing. */
+void log_resize_release();
diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h
new file mode 100644
index 00000000..6d75e15a
--- /dev/null
+++ b/storage/innobase/include/log0recv.h
@@ -0,0 +1,491 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0recv.h
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "ut0new.h"
+#include "buf0types.h"
+#include "log0log.h"
+#include "mtr0types.h"
+
+#include <deque>
+#include <map>
+
+/** @return whether recovery is currently running (recv_sys.recovery_on) */
+#define recv_recovery_is_on() UNIV_UNLIKELY(recv_sys.recovery_on)
+
+ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Apply any buffered redo log to a page.
+@param space     tablespace (declared nonnull)
+@param bpage     buffer pool page (declared nonnull)
+@return whether the page was recovered correctly */
+bool recv_recover_page(fil_space_t* space, buf_page_t* bpage);
+
+/**
+Start recovering from a redo log checkpoint.
+@return error code or DB_SUCCESS */
+dberr_t recv_recovery_from_checkpoint_start();
+
+/** Function pointer: report a file create, delete, or rename during backup.
+@param[in]	space_id	tablespace identifier
+@param[in]	type		file operation redo log type
+@param[in]	name		file name (not NUL-terminated)
+@param[in]	len		length of name, in bytes
+@param[in]	new_name	new file name (NULL if not rename)
+@param[in]	new_len		length of new_name, in bytes (0 if NULL) */
+extern void (*log_file_op)(uint32_t space_id, int type,
+			   const byte* name, ulint len,
+			   const byte* new_name, ulint new_len);
+
+/** Function pointer: report an operation which does undo log tablespace
+truncation during backup.
+@param	space_id	undo tablespace identifier */
+extern void (*undo_space_trunc)(uint32_t space_id);
+
+/** Function pointer: report an INIT_PAGE operation for page0 during backup.
+@param	space_id	tablespace identifier */
+extern void (*first_page_init)(uint32_t space_id);
+
+/** Stored redo log record */
+struct log_rec_t
+{
+  /** next record, or nullptr if none has been linked */
+  log_rec_t *next= nullptr;
+  /** mtr_t::commit_lsn() of the mini-transaction; must be nonzero */
+  const lsn_t lsn;
+
+  log_rec_t(lsn_t lsn) : lsn(lsn) { ut_ad(lsn); }
+  log_rec_t()= delete;
+  log_rec_t(const log_rec_t&)= delete;
+  log_rec_t &operator=(const log_rec_t&)= delete;
+};
+
+struct recv_dblwr_t
+{
+  /** Add a page frame to the front of the doublewrite recovery buffer. */
+  void add(byte *page) { pages.push_front(page); }
+
+  /** Validate the page.
+  @param page_id  page identifier
+  @param page     page contents
+  @param space    the tablespace of the page (not available for page 0)
+  @param tmp_buf  2*srv_page_size for decrypting and decompressing any
+  page_compressed or encrypted pages
+  @return whether the page is valid */
+  bool validate_page(const page_id_t page_id, const byte *page,
+                     const fil_space_t *space, byte *tmp_buf);
+
+  /** Find a doublewrite copy of a page.
+  @param page_id  page identifier
+  @param space    tablespace (not available for page_id.page_no()==0)
+  @param tmp_buf  2*srv_page_size for decrypting and decompressing any
+  page_compressed or encrypted pages
+  @return page frame
+  @retval NULL if no valid page for page_id was found */
+  byte* find_page(const page_id_t page_id, const fil_space_t *space= NULL,
+                  byte *tmp_buf= NULL);
+
+  /** Restore the first page of the given tablespace from
+  doublewrite buffer.
+  @param space_id  tablespace identifier
+  @param name      tablespace filepath
+  @param file      tablespace file handle
+  @return whether the operation failed */
+  bool restore_first_page(uint32_t space_id, const char *name, os_file_t file);
+  /** Container type of the page frame pointers */
+  typedef std::deque<byte*, ut_allocator<byte*> > list;
+
+  /** Recovered doublewrite buffer page frames */
+  list pages;
+};
+
+/** recv_sys.pages entry; protected by recv_sys.mutex */
+struct page_recv_t
+{
+  /** Recovery status: 0=not in progress, 1=log is being applied,
+  -1=log has been applied and the entry may be erased.
+  Transitions from 1 to -1 are NOT protected by recv_sys.mutex. */
+  Atomic_relaxed<int8_t> being_processed{0};
+  /** Whether reading the page will be skipped */
+  bool skip_read= false;
+  /** Latest written byte offset when applying the log records.
+  @see mtr_t::m_last_offset */
+  uint16_t last_offset= 1;
+  /** Singly linked list of the log records for a page */
+  class recs_t
+  {
+    /** The first log record, or nullptr if the list is empty */
+    log_rec_t *head= nullptr;
+    /** The last log record, or nullptr if the list is empty */
+    log_rec_t *tail= nullptr;
+    friend struct page_recv_t;
+  public:
+    /** Link a redo log snippet to the end of the list.
+    @param recs log snippet */
+    void append(log_rec_t* recs)
+    {
+      if (!tail)
+        head= recs;
+      else
+        tail->next= recs;
+      tail= recs;
+    }
+    /** Remove the last records for the page
+    @param start_lsn   start of the removed log */
+    ATTRIBUTE_COLD void rewind(lsn_t start_lsn);
+
+    /** @return the last log snippet (read-only access) */
+    const log_rec_t* last() const { return tail; }
+    /** @return the last log snippet */
+    log_rec_t* last() { return tail; }
+    /** Forward iterator that follows log_rec_t::next */
+    class iterator
+    {
+      log_rec_t *pos;
+    public:
+      iterator(log_rec_t* rec) : pos(rec) {}
+      log_rec_t* operator*() const { return pos; }
+      iterator &operator++() { pos= pos->next; return *this; }
+      bool operator!=(const iterator& i) const { return i.pos != pos; }
+    };
+    iterator begin() { return iterator(head); }
+    iterator end() { return iterator(nullptr); }
+    bool empty() const { ut_ad(!head == !tail); return head == nullptr; }
+    /** Clear and free the records; @see recv_sys_t::add() */
+    void clear();
+  } log;
+
+  /** Trim old log records for a page.
+  @param start_lsn oldest log sequence number to preserve
+  @return whether all the log for the page was trimmed */
+  inline bool trim(lsn_t start_lsn);
+  /** Ignore any earlier redo log records for this page. */
+  inline void will_not_read();
+};
+
+/** A page initialization operation that was parsed from the redo log */
+struct recv_init
+{
+  /** log sequence number of the page initialization operation */
+  lsn_t lsn;
+  /** Whether btr_page_create() avoided a read of the page.
+  At the end of the last recovery batch, mark_ibuf_exist()
+  will mark pages for which this flag is set. */
+  bool created;
+};
+
+/** Recovery system data structure */
+struct recv_sys_t
+{
+  using init= recv_init;
+
+  /** mutex protecting this as well as some of page_recv_t */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
+private:
+  /** set when finding a corrupt log block or record, or there is a
+  log parsing buffer overflow */
+  bool found_corrupt_log;
+  /** set when an inconsistency with the file system contents is detected
+  during log scan or apply */
+  bool found_corrupt_fs;
+public:
+  /** maximum guaranteed size of a mini-transaction on recovery */
+  static constexpr size_t MTR_SIZE_MAX{1U << 20};
+
+  /** whether we are applying redo log records during crash recovery */
+  bool recovery_on;
+  /** whether recv_recover_page(), invoked from buf_page_t::read_complete(),
+  should apply log records */
+  bool apply_log_recs;
+  /** number of bytes in log_sys.buf */
+  size_t len;
+  /** start offset of non-parsed log records in log_sys.buf */
+  size_t offset;
+  /** log sequence number of the first non-parsed record */
+  lsn_t lsn;
+  /** log sequence number of the last parsed mini-transaction */
+  lsn_t scanned_lsn;
+  /** log sequence number at the end of the FILE_CHECKPOINT record, or 0 */
+  lsn_t file_checkpoint;
+  /** the time when progress was last reported */
+  time_t progress_time;
+
+  using map = std::map<const page_id_t, page_recv_t,
+                       std::less<const page_id_t>,
+                       ut_allocator<std::pair<const page_id_t, page_recv_t>>>;
+  /** buffered records waiting to be applied to pages */
+  map pages;
+
+private:
+  /** iterator to pages, used by parse() */
+  map::iterator pages_it;
+
+  /** Process a record that indicates that a tablespace size is being shrunk.
+  @param page_id first page that is not in the file
+  @param lsn     log sequence number of the shrink operation */
+  inline void trim(const page_id_t page_id, lsn_t lsn);
+
+  /** Undo tablespaces for which truncate has been logged
+  (indexed by page_id_t::space() - srv_undo_space_id_start) */
+  struct trunc
+  {
+    /** log sequence number of FILE_CREATE, or 0 if none */
+    lsn_t lsn;
+    /** truncated size of the tablespace, or 0 if not truncated */
+    unsigned pages;
+  } truncated_undo_spaces[127];
+
+public:
+  /** The contents of the doublewrite buffer */
+  recv_dblwr_t dblwr;
+
+  __attribute__((warn_unused_result)) 
+  inline dberr_t read(os_offset_t offset, span<byte> buf);
+  inline size_t files_size();
+  void close_files();
+
+  /** Advance pages_it if it matches the iterator */
+  void pages_it_invalidate(const map::iterator &p)
+  {
+    mysql_mutex_assert_owner(&mutex);
+    if (pages_it == p)
+      pages_it++;
+  }
+  /** Invalidate pages_it if it points to the given tablespace */
+  void pages_it_invalidate(uint32_t space_id)
+  {
+    mysql_mutex_assert_owner(&mutex);
+    if (pages_it != pages.end() && pages_it->first.space() == space_id)
+      pages_it= pages.end();
+  }
+
+private:
+  /** Attempt to initialize a page based on redo log records.
+  @param p        iterator
+  @param mtr      mini-transaction
+  @param b        pre-allocated buffer pool block
+  @param init     page initialization
+  @return the recovered block
+  @retval nullptr if the page cannot be initialized based on log records
+  @retval -1      if the page cannot be recovered due to corruption */
+  inline buf_block_t *recover_low(const map::iterator &p, mtr_t &mtr,
+                                  buf_block_t *b, init &init);
+  /** Attempt to initialize a page based on redo log records.
+  @param page_id  page identifier
+  @return the recovered block
+  @retval nullptr if the page cannot be initialized based on log records
+  @retval -1      if the page cannot be recovered due to corruption */
+  ATTRIBUTE_COLD buf_block_t *recover_low(const page_id_t page_id);
+
+  /** All found log files (multiple ones are possible if we are upgrading
+  from before MariaDB Server 10.5.1) */
+  std::vector<log_file_t> files;
+
+  /** Base node of the redo block list.
+  List elements are linked via buf_block_t::unzip_LRU. */
+  UT_LIST_BASE_NODE_T(buf_block_t) blocks;
+
+  /** Allocate a block from the buffer pool for recv_sys.pages */
+  ATTRIBUTE_COLD buf_block_t *add_block();
+
+  /** Wait for buffer pool to become available.
+  @param pages number of buffer pool pages needed */
+  ATTRIBUTE_COLD void wait_for_pool(size_t pages);
+
+  /** Free log for processed pages. */
+  void garbage_collect();
+
+  /** Apply a recovery batch.
+  @param space_id       current tablespace identifier
+  @param space          current tablespace
+  @param free_block     spare buffer block
+  @param last_batch     whether it is possible to write more redo log
+  @return whether the caller must provide a new free_block */
+  bool apply_batch(uint32_t space_id, fil_space_t *&space,
+                   buf_block_t *&free_block, bool last_batch);
+
+public:
+  /** Apply buffered log to persistent data pages.
+  @param last_batch     whether it is possible to write more redo log */
+  void apply(bool last_batch);
+
+#ifdef UNIV_DEBUG
+  /** whether all redo log in the current batch has been applied */
+  bool after_apply= false;
+#endif /* UNIV_DEBUG */
+  /** Initialize the redo log recovery subsystem. */
+  void create();
+
+  /** Free most recovery data structures. */
+  void debug_free();
+
+  /** Clean up after create() */
+  void close();
+  /** @return whether scanned_lsn is nonzero */
+  bool is_initialised() const { return scanned_lsn != 0; }
+
+  /** Find the latest checkpoint.
+  @return error code or DB_SUCCESS */
+  dberr_t find_checkpoint();
+
+  /** Register a redo log snippet for a page.
+  @param it       page iterator
+  @param start_lsn start LSN of the mini-transaction
+  @param lsn      @see mtr_t::commit_lsn()
+  @param l        redo log snippet
+  @param len      length of l, in bytes
+  @return whether we ran out of memory */
+  bool add(map::iterator it, lsn_t start_lsn, lsn_t lsn,
+           const byte *l, size_t len);
+
+  /** Parsing result */
+  enum parse_mtr_result {
+    /** a record was successfully parsed */
+    OK,
+    /** the log ended prematurely (need to read more) */
+    PREMATURE_EOF,
+    /** the end of the log was reached */
+    GOT_EOF,
+    /** parse<true>(l, false) ran out of memory */
+    GOT_OOM
+  };
+
+private:
+  /** Parse and register one log_t::FORMAT_10_8 mini-transaction.
+  @tparam store     whether to store the records
+  @param  l         log data source
+  @param  if_exists if store: whether to check if the tablespace exists */
+  template<typename source,bool store>
+  inline parse_mtr_result parse(source &l, bool if_exists) noexcept;
+
+  /** Rewind a mini-transaction when parse() runs out of memory.
+  @param  l         log data source
+  @param  begin     start of the mini-transaction */
+  template<typename source>
+  ATTRIBUTE_COLD void rewind(source &l, source &begin) noexcept;
+
+  /** Report progress in terms of LSN or pages remaining */
+  ATTRIBUTE_COLD void report_progress() const;
+public:
+  /** Parse and register one log_t::FORMAT_10_8 mini-transaction,
+  handling log_sys.is_pmem() buffer wrap-around.
+  @tparam store     whether to store the records
+  @param  if_exists if store: whether to check if the tablespace exists */
+  template<bool store>
+  static parse_mtr_result parse_mtr(bool if_exists) noexcept;
+
+  /** Parse and register one log_t::FORMAT_10_8 mini-transaction,
+  handling log_sys.is_pmem() buffer wrap-around.
+  @tparam store     whether to store the records
+  @param  if_exists if store: whether to check if the tablespace exists */
+  template<bool store>
+  static parse_mtr_result parse_pmem(bool if_exists) noexcept
+#ifdef HAVE_PMEM
+    ;
+#else /* !HAVE_PMEM: forward to parse_mtr() */
+  { return parse_mtr<store>(if_exists); }
+#endif /* HAVE_PMEM */
+
+  /** Erase log records for a page. */
+  void erase(map::iterator p);
+
+  /** Clear a fully processed set of stored redo log records. */
+  void clear();
+
+  /** Determine whether redo log recovery progress should be reported.
+  @param time  the current time
+  @return whether progress should be reported
+  (the last report was at least 15 seconds ago) */
+  bool report(time_t time);
+
+  /** The alloc() memory alignment, in bytes */
+  static constexpr size_t ALIGNMENT= sizeof(size_t);
+
+  /** Free a redo log snippet.
+  @param data buffer allocated in add() */
+  inline void free(const void *data);
+
+  /** Remove records for a corrupted page.
+  This function should only be called when innodb_force_recovery is set.
+  @param page_id  corrupted page identifier */
+  ATTRIBUTE_COLD void free_corrupted_page(page_id_t page_id);
+
+  /** Flag data file corruption during recovery. */
+  ATTRIBUTE_COLD void set_corrupt_fs();
+  /** Flag log file corruption during recovery. */
+  ATTRIBUTE_COLD void set_corrupt_log();
+
+  /** @return whether data file corruption was found */
+  bool is_corrupt_fs() const { return UNIV_UNLIKELY(found_corrupt_fs); }
+  /** @return whether log file corruption was found */
+  bool is_corrupt_log() const { return UNIV_UNLIKELY(found_corrupt_log); }
+
+  /** Attempt to initialize a page based on redo log records.
+  @param page_id  page identifier
+  @return the recovered block
+  @retval nullptr if the page cannot be initialized based on log records
+  @retval -1      if the page cannot be recovered due to corruption */
+  buf_block_t *recover(const page_id_t page_id)
+  {
+    return UNIV_UNLIKELY(recovery_on) ? recover_low(page_id) : nullptr;
+  }
+
+  /** Try to recover a tablespace that was not readable earlier
+  @param p          iterator
+  @param name       tablespace file name
+  @param free_block spare buffer block
+  @return recovered tablespace
+  @retval nullptr if recovery failed */
+  fil_space_t *recover_deferred(const map::iterator &p,
+                                const std::string &name,
+                                buf_block_t *&free_block);
+};
+
+/** The recovery system */
+extern recv_sys_t	recv_sys;
+
+/** If the following is true, the buffer pool file pages must be invalidated
+after recovery and no ibuf operations are allowed; this will be set if
+recv_sys.pages becomes too full, and log records must be merged
+to file pages already before the recovery is finished: in this case no
+ibuf operations are allowed, as they could modify the pages read in the
+buffer pool before the pages have been recovered to the up-to-date state.
+
+A value of true means that recovery is running and no operations on the
+log files are allowed yet: the variable name is misleading. */
+extern bool		recv_no_ibuf_operations;
+/** true when recv_init_crash_recovery() has been called. */
+extern bool		recv_needed_recovery;
+#ifdef UNIV_DEBUG
+/** whether writing to the redo log is forbidden;
+protected by exclusive log_sys.latch. */
+extern bool recv_no_log_write;
+#endif /* UNIV_DEBUG */
+
+/** true if buf_page_is_corrupted() should check if the log sequence
+number (FIL_PAGE_LSN) is in the future.  Initially false, and set by
+recv_recovery_from_checkpoint_start(). */
+extern bool		recv_lsn_checks_on;
diff --git a/storage/innobase/include/log0types.h b/storage/innobase/include/log0types.h
new file mode 100644
index 00000000..df87968d
--- /dev/null
+++ b/storage/innobase/include/log0types.h
@@ -0,0 +1,38 @@
+/*****************************************************************************
+
+Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0types.h
+Log types
+
+Created 2013-03-15 Sunny Bains
+*******************************************************/
+
+#ifndef log0types_h
+#define log0types_h
+
+#include "univ.i"
+
+/** Type used for all log sequence number storage and arithmetic */
+typedef	ib_uint64_t		lsn_t;
+/** Maximum lsn_t value (IB_UINT64_MAX) */
+#define LSN_MAX			IB_UINT64_MAX
+/** Format specifier macro for lsn_t; the same as UINT64PF */
+#define LSN_PF			UINT64PF
+
+#endif /* log0types_h */
diff --git a/storage/innobase/include/mach0data.h b/storage/innobase/include/mach0data.h
new file mode 100644
index 00000000..79cbd7d1
--- /dev/null
+++ b/storage/innobase/include/mach0data.h
@@ -0,0 +1,375 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/mach0data.h
+Utilities for converting data from the database file
+to the machine format.
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef mach0data_h
+#define mach0data_h
+
+#include "univ.i"
+#include "mtr0types.h"
+
+#ifndef UNIV_INNOCHECKSUM
+
+/* The data and all fields are always stored in a database file
+in the same format: ascii, big-endian, ... .
+All data in the files MUST be accessed using the functions in this
+module. */
+
+/*******************************************************//**
+The following function is used to store data in one byte. */
+UNIV_INLINE
+void
+mach_write_to_1(
+/*============*/
+	byte*	b,	/*!< in: pointer to byte where to store */
+	ulint	n);	 /*!< in: ulint integer to be stored, >= 0, < 256 */
+/** The following function is used to fetch data from one byte.
+@param[in]	b	pointer to a byte to read
+@return 8-bit integer, >= 0, < 256 */
+UNIV_INLINE
+uint8_t
+mach_read_from_1(
+	const byte*	b)
+	MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************//**
+The following function is used to store data in two consecutive
+bytes. We store the most significant byte to the lower address. */
+UNIV_INLINE
+void
+mach_write_to_2(
+/*============*/
+	byte*	b,	/*!< in: pointer to two bytes where to store */
+	ulint	n);	 /*!< in: ulint integer to be stored, >= 0, < 64k */
+#endif /* !UNIV_INNOCHECKSUM */
+/** The following function is used to fetch data from 2 consecutive
+bytes. The most significant byte is at the lowest address.
+@param[in]	b	pointer to 2 bytes to read
+@return 2-byte integer, >= 0, < 64k */
+UNIV_INLINE
+uint16_t
+mach_read_from_2(
+	const byte*	b)
+	MY_ATTRIBUTE((warn_unused_result));
+
+#ifndef UNIV_INNOCHECKSUM
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+to the canonical format, for fast bytewise equality test
+against memory.
+@return 16-bit integer in canonical format */
+UNIV_INLINE
+uint16
+mach_encode_2(
+/*==========*/
+	ulint	n)	/*!< in: integer in machine-dependent format */
+	MY_ATTRIBUTE((const));
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+from the canonical format, for fast bytewise equality test
+against memory.
+@return integer in machine-dependent format */
+UNIV_INLINE
+ulint
+mach_decode_2(
+/*==========*/
+	uint16	n)	/*!< in: 16-bit integer in canonical format */
+	MY_ATTRIBUTE((const));
+/*******************************************************//**
+The following function is used to store data in 3 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_3(
+/*============*/
+	byte*	b,	/*!< in: pointer to 3 bytes where to store */
+	ulint	n);	 /*!< in: ulint integer to be stored */
+/** The following function is used to fetch data from 3 consecutive
+bytes. The most significant byte is at the lowest address.
+@param[in]	b	pointer to 3 bytes to read
+@return 32 bit integer */
+UNIV_INLINE
+uint32_t
+mach_read_from_3(
+	const byte*	b)
+	MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************//**
+The following function is used to store data in four consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_4(
+/*============*/
+	byte*	b,	/*!< in: pointer to four bytes where to store */
+	ulint	n);	 /*!< in: ulint integer to be stored */
+/** The following function is used to fetch data from 4 consecutive
+bytes. The most significant byte is at the lowest address.
+@param[in]	b	pointer to 4 bytes to read
+@return 32 bit integer */
+UNIV_INLINE
+uint32_t
+mach_read_from_4(
+	const byte*	b)
+	MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************//**
+Writes a ulint in a compressed form (1..5 bytes).
+@return stored size in bytes */
+UNIV_INLINE
+ulint
+mach_write_compressed(
+/*==================*/
+	byte*	b,	/*!< in: pointer to memory where to store */
+	ulint	n);	/*!< in: ulint integer to be stored */
+/*********************************************************//**
+Returns the size of a ulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_get_compressed_size(
+/*=====================*/
+	ulint	n)	/*!< in: ulint integer to be stored */
+	MY_ATTRIBUTE((const));
+/** Read a 32-bit integer in a compressed form.
+@param[in,out]	b	pointer to memory where to read;
+advanced by the number of bytes consumed
+@return unsigned value */
+UNIV_INLINE
+ib_uint32_t
+mach_read_next_compressed(
+	const byte**	b);
+/*******************************************************//**
+The following function is used to store data in 6 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_6(
+/*============*/
+	byte*		b,	/*!< in: pointer to 6 bytes where to store */
+	ib_uint64_t	id);	/*!< in: 48-bit integer */
+/********************************************************//**
+The following function is used to fetch data from 6 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 48-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_6(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to 6 bytes */
+	MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************//**
+The following function is used to store data in 7 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_7(
+/*============*/
+	byte*		b,	/*!< in: pointer to 7 bytes where to store */
+	ib_uint64_t	n);	/*!< in: 56-bit integer */
+/********************************************************//**
+The following function is used to fetch data from 7 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 56-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_7(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to 7 bytes */
+	MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_8(
+/*============*/
+	void*		b,	/*!< in: pointer to 8 bytes where to store */
+	ib_uint64_t	n);	/*!< in: 64-bit integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 64-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_8(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to 8 bytes */
+	MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************//**
+Writes a 64-bit integer in a compressed form (5..9 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_u64_write_compressed(
+/*======================*/
+	byte*		b,	/*!< in: pointer to memory where to store */
+	ib_uint64_t	n);	/*!< in: 64-bit integer to be stored */
+/** Read a 64-bit integer in a compressed form.
+@param[in,out]	b	pointer to memory where to read;
+advanced by the number of bytes consumed
+@return unsigned value */
+UNIV_INLINE
+ib_uint64_t
+mach_u64_read_next_compressed(
+	const byte**	b);
+/*********************************************************//**
+Writes a 64-bit integer in a compressed form (1..11 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_u64_write_much_compressed(
+/*===========================*/
+	byte*		b,	/*!< in: pointer to memory where to store */
+	ib_uint64_t	n);	/*!< in: 64-bit integer to be stored */
+/*********************************************************//**
+Reads a 64-bit integer in a compressed form.
+@return the value read */
+UNIV_INLINE
+ib_uint64_t
+mach_u64_read_much_compressed(
+/*==========================*/
+	const byte*	b)	/*!< in: pointer to memory from where to read */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************//**
+Reads a double. It is stored in a little-endian format.
+@return double read */
+UNIV_INLINE
+double
+mach_double_read(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to memory from where to read */
+	MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************//**
+Writes a double. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_double_write(
+/*==============*/
+	byte*	b,	/*!< in: pointer to memory where to write */
+	double	d);	/*!< in: double */
+/*********************************************************//**
+Reads a float. It is stored in a little-endian format.
+@return float read */
+UNIV_INLINE
+float
+mach_float_read(
+/*============*/
+	const byte*	b)	/*!< in: pointer to memory from where to read */
+	MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************//**
+Writes a float. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_float_write(
+/*=============*/
+	byte*	b,	/*!< in: pointer to memory where to write */
+	float	d);	/*!< in: float */
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_n_little_endian(
+/*===========================*/
+	const byte*	buf,		/*!< in: from where to read */
+	ulint		buf_size)	/*!< in: from how many bytes to read */
+	MY_ATTRIBUTE((warn_unused_result));
+
+
+/** Reads a 64-bit unsigned integer stored in little-endian format.
+@param	buf		pointer to the 8 bytes to read
+@return the value in host byte order */
+UNIV_INLINE
+uint64_t
+mach_read_uint64_little_endian(const byte* buf)
+{
+#ifdef WORDS_BIGENDIAN
+  return
+    uint64_t(buf[0])       | uint64_t(buf[1]) << 8 |
+    uint64_t(buf[2]) << 16 | uint64_t(buf[3]) << 24 |
+    uint64_t(buf[4]) << 32 | uint64_t(buf[5]) << 40 |
+    uint64_t(buf[6]) << 48 | uint64_t(buf[7]) << 56;
+#else
+  uint64_t n;
+  memcpy(&n, buf, sizeof(uint64_t)); /* host order already matches storage */
+  return n;
+#endif
+}
+
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_n_little_endian(
+/*==========================*/
+	byte*	dest,		/*!< in: where to write */
+	ulint	dest_size,	/*!< in: into how many bytes to write */
+	ulint	n);		/*!< in: unsigned long int to write */
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_2_little_endian(
+/*===========================*/
+	const byte*	buf)		/*!< in: from where to read */
+	MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_2_little_endian(
+/*==========================*/
+	byte*	dest,		/*!< in: where to write */
+	ulint	n);		/*!< in: unsigned long int to write */
+/*********************************************************//**
+Convert integral type from storage byte order (big endian) to
+host byte order.
+@return integer value */
+UNIV_INLINE
+ib_uint64_t
+mach_read_int_type(
+/*===============*/
+	const byte*	src,		/*!< in: where to read from */
+	ulint		len,		/*!< in: length of src */
+	ibool		unsigned_type);	/*!< in: nonzero if the type is unsigned */
+
+/*************************************************************
+Convert a ulonglong integer from host byte order to (big-endian)
+storage byte order. */
+UNIV_INLINE
+void
+mach_write_ulonglong(
+/*=================*/
+	byte*		dest,		/*!< in: where to write */
+	ulonglong	src,		/*!< in: value to write */
+	ulint		len,		/*!< in: length of dest */
+	bool		usign);		/*!< in: true if unsigned, false if signed */
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#include "mach0data.inl"
+
+#endif
diff --git a/storage/innobase/include/mach0data.inl b/storage/innobase/include/mach0data.inl
new file mode 100644
index 00000000..2f970fd2
--- /dev/null
+++ b/storage/innobase/include/mach0data.inl
@@ -0,0 +1,837 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/mach0data.inl
+Utilities for converting data from the database file
+to the machine format.
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "mtr0types.h"
+#include "ut0byte.h"
+
+/*******************************************************//**
+Stores an integer value in one byte. */
+UNIV_INLINE
+void
+mach_write_to_1(
+/*============*/
+	byte*	b,	/*!< in: pointer to byte where to store */
+	ulint	n)	/*!< in: ulint integer to be stored, >= 0, < 256 */
+{
+	ut_ad((n & ~0xFFUL) == 0);	/* no bits above the low 8 may be set */
+
+	b[0] = (byte) n;
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*******************************************************//**
+The following function is used to store data in two consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_2(
+/*============*/
+	byte*	b,	/*!< in: pointer to two bytes where to store */
+	ulint	n)	/*!< in: ulint integer to be stored, >= 0, < 64k */
+{
+	ut_ad((n & ~0xFFFFUL) == 0);	/* no bits above the low 16 may be set */
+
+	b[0] = (byte)(n >> 8);	/* most significant byte */
+	b[1] = (byte)(n);	/* least significant byte */
+}
+
+/** The following function is used to fetch data from one byte.
+@param[in]	b	pointer to a byte to read
+@return 8-bit integer, >= 0, < 256 */
+UNIV_INLINE
+uint8_t
+mach_read_from_1(
+	const byte*	b)
+{
+	return(uint8_t(*b));
+}
+
+/** The following function is used to fetch data from 2 consecutive
+bytes. The most significant byte is at the lowest address.
+@param[in]	b	pointer to 2 bytes to read
+@return 2-byte integer, >= 0, < 64k */
+UNIV_INLINE
+uint16_t
+mach_read_from_2(
+	const byte*	b)
+{
+	return(uint16_t(uint16_t(b[0]) << 8 | b[1]));	/* b[0] is the high byte */
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+to the canonical format (the byte layout written by mach_write_to_2()),
+for fast bytewise equality test against memory.
+@return 16-bit integer in canonical format */
+UNIV_INLINE
+uint16
+mach_encode_2(
+/*==========*/
+	ulint	n)	/*!< in: integer in machine-dependent format */
+{
+	uint16	ret;
+	ut_ad(2 == sizeof ret);
+	mach_write_to_2((byte*) &ret, n);	/* fill the two bytes of ret */
+	return(ret);
+}
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+from the canonical format (the byte layout read by mach_read_from_2()),
+the inverse of mach_encode_2().
+@return integer in machine-dependent format */
+UNIV_INLINE
+ulint
+mach_decode_2(
+/*==========*/
+	uint16	n)	/*!< in: 16-bit integer in canonical format */
+{
+	ut_ad(2 == sizeof n);
+	return(mach_read_from_2((const byte*) &n));	/* read the two bytes of n */
+}
+
+/*******************************************************//**
+The following function is used to store data in 3 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_3(
+/*============*/
+	byte*	b,	/*!< in: pointer to 3 bytes where to store */
+	ulint	n)	/*!< in: ulint integer to be stored, < 2^24 */
+{
+	ut_ad((n & ~0xFFFFFFUL) == 0);	/* no bits above the low 24 may be set */
+
+	b[0] = (byte)(n >> 16);
+	b[1] = (byte)(n >> 8);
+	b[2] = (byte)(n);
+}
+
+/** The following function is used to fetch data from 3 consecutive
+bytes. The most significant byte is at the lowest address.
+@param[in]	b	pointer to 3 bytes to read
+@return 24-bit value as a uint32_t integer */
+UNIV_INLINE
+uint32_t
+mach_read_from_3(
+	const byte*	b)
+{
+	return( (static_cast<uint32_t>(b[0]) << 16)
+		| (static_cast<uint32_t>(b[1]) << 8)
+		| static_cast<uint32_t>(b[2])
+		);
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*******************************************************//**
+The following function is used to store data in four consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_4(
+/*============*/
+	byte*	b,	/*!< in: pointer to four bytes where to store */
+	ulint	n)	/*!< in: ulint integer to be stored; no range assertion here */
+{
+	b[0] = (byte)(n >> 24);
+	b[1] = (byte)(n >> 16);
+	b[2] = (byte)(n >> 8);
+	b[3] = (byte) n;
+}
+
+/** The following function is used to fetch data from 4 consecutive
+bytes. The most significant byte is at the lowest address.
+@param[in]	b	pointer to 4 bytes to read
+@return 32 bit integer assembled from b[0] (high byte) to b[3] (low byte) */
+UNIV_INLINE
+uint32_t
+mach_read_from_4(
+	const byte*	b)
+{
+	return( (static_cast<uint32_t>(b[0]) << 24)
+		| (static_cast<uint32_t>(b[1]) << 16)
+		| (static_cast<uint32_t>(b[2]) << 8)
+		| static_cast<uint32_t>(b[3])
+		);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/*********************************************************//**
+Writes a ulint in a compressed form where the first byte codes the
+length of the stored ulint. We look at the most significant bits of
+the byte. If the most significant bit is zero, it means 1-byte storage,
+else if the 2nd bit is 0, it means 2-byte storage, else if 3rd is 0,
+it means 3-byte storage, else if 4th is 0, it means 4-byte storage,
+else the storage is 5-byte. The shortest form that fits n is always chosen.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_write_compressed(
+/*==================*/
+	byte*	b,	/*!< in: pointer to memory where to store */
+	ulint	n)	/*!< in: ulint integer (< 2^32) to be stored */
+{
+	if (n < 0x80) {
+		/* 0nnnnnnn (7 bits) */
+		mach_write_to_1(b, n);
+		return(1);
+	} else if (n < 0x4000) {
+		/* 10nnnnnn nnnnnnnn (14 bits) */
+		mach_write_to_2(b, n | 0x8000);
+		return(2);
+	} else if (n < 0x200000) {
+		/* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */
+		mach_write_to_3(b, n | 0xC00000);
+		return(3);
+	} else if (n < 0x10000000) {
+		/* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */
+		mach_write_to_4(b, n | 0xE0000000);
+		return(4);
+	} else {
+		/* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */
+		mach_write_to_1(b, 0xF0);	/* marker byte, then 4 value bytes */
+		mach_write_to_4(b + 1, n);
+		return(5);
+	}
+}
+
+/*********************************************************//**
+Returns the size of a ulint when written in the compressed form.
+@return compressed size in bytes (1..5), same limits as mach_write_compressed() */
+UNIV_INLINE
+ulint
+mach_get_compressed_size(
+/*=====================*/
+	ulint	n)	/*!< in: ulint integer (< 2^32) to be stored */
+{
+	if (n < 0x80) {
+		/* 0nnnnnnn (7 bits) */
+		return(1);
+	} else if (n < 0x4000) {
+		/* 10nnnnnn nnnnnnnn (14 bits) */
+		return(2);
+	} else if (n < 0x200000) {
+		/* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */
+		return(3);
+	} else if (n < 0x10000000) {
+		/* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */
+		return(4);
+	} else {
+		/* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */
+		return(5);
+	}
+}
+
+/*********************************************************//**
+Reads a ulint in a compressed form; the pointer is not advanced.
+@return read integer (< 2^32) */
+UNIV_INLINE
+ulint
+mach_read_compressed(
+/*=================*/
+	const byte*	b)	/*!< in: pointer to memory from where to read */
+{
+	ulint	val;
+
+	val = mach_read_from_1(b);	/* the first byte selects the length */
+
+	if (val < 0x80) {
+		/* 0nnnnnnn (7 bits) */
+	} else if (val < 0xC0) {
+		/* 10nnnnnn nnnnnnnn (14 bits) */
+		val = mach_read_from_2(b) & 0x3FFF;
+		ut_ad(val > 0x7F);	/* value would not fit a shorter form */
+	} else if (val < 0xE0) {
+		/* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */
+		val = mach_read_from_3(b) & 0x1FFFFF;
+		ut_ad(val > 0x3FFF);
+	} else if (val < 0xF0) {
+		/* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */
+		val = mach_read_from_4(b) & 0xFFFFFFF;
+		ut_ad(val > 0x1FFFFF);
+	} else {
+		/* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */
+		ut_ad(val == 0xF0);
+		val = mach_read_from_4(b + 1);	/* skip the 0xF0 marker byte */
+		ut_ad(val > 0xFFFFFFF);
+	}
+
+	return(val);
+}
+
+/** Read a 32-bit integer in a compressed form (1..5 bytes).
+@param[in,out]	b	pointer to memory where to read;
+advanced by the number of bytes consumed
+@return unsigned value */
+UNIV_INLINE
+ib_uint32_t
+mach_read_next_compressed(
+	const byte**	b)
+{
+	ulint	val = mach_read_from_1(*b);	/* the first byte selects the length */
+
+	if (val < 0x80) {
+		/* 0nnnnnnn (7 bits) */
+		++*b;
+	} else if (val < 0xC0) {
+		/* 10nnnnnn nnnnnnnn (14 bits) */
+		val = mach_read_from_2(*b) & 0x3FFF;
+		ut_ad(val > 0x7F);	/* value would not fit a shorter form */
+		*b += 2;
+	} else if (val < 0xE0) {
+		/* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */
+		val = mach_read_from_3(*b) & 0x1FFFFF;
+		ut_ad(val > 0x3FFF);
+		*b += 3;
+	} else if (val < 0xF0) {
+		/* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */
+		val = mach_read_from_4(*b) & 0xFFFFFFF;
+		ut_ad(val > 0x1FFFFF);
+		*b += 4;
+	} else {
+		/* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */
+		ut_ad(val == 0xF0);
+		val = mach_read_from_4(*b + 1);	/* skip the 0xF0 marker byte */
+		ut_ad(val > 0xFFFFFFF);
+		*b += 5;
+	}
+
+	return(static_cast<ib_uint32_t>(val));
+}
+
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_8(
+/*============*/
+	void*		b,	/*!< in: pointer to 8 bytes where to store */
+	ib_uint64_t	n)	/*!< in: 64-bit integer to be stored */
+{
+	mach_write_to_4(static_cast<byte*>(b), (ulint) (n >> 32)); /* high half */
+	mach_write_to_4(static_cast<byte*>(b) + 4, (ulint) n); /* low half */
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 64-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_8(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to 8 bytes */
+{
+	ib_uint64_t	u64;
+
+	u64 = mach_read_from_4(b);	/* high 32 bits come first */
+	u64 <<= 32;
+	u64 |= mach_read_from_4(b + 4);	/* then the low 32 bits */
+
+	return(u64);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/*******************************************************//**
+The following function is used to store data in 7 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_7(
+/*============*/
+	byte*		b,	/*!< in: pointer to 7 bytes where to store */
+	ib_uint64_t	n)	/*!< in: 56-bit integer */
+{
+	mach_write_to_3(b, (ulint) (n >> 32));	/* upper 24 bits, range-checked there */
+	mach_write_to_4(b + 3, (ulint) n);	/* lower 32 bits */
+}
+
+/********************************************************//**
+The following function is used to fetch data from 7 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 56-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_7(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to 7 bytes */
+{
+	/* first 3 bytes and last 4 bytes; presumably (high, low) - confirm in ut0byte.h */
+	return(ut_ull_create(mach_read_from_3(b), mach_read_from_4(b + 3)));
+}
+
+/*******************************************************//**
+The following function is used to store data in 6 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_6(
+/*============*/
+	byte*		b,	/*!< in: pointer to 6 bytes where to store */
+	ib_uint64_t	n)	/*!< in: 48-bit integer */
+{
+	mach_write_to_2(b, (ulint) (n >> 32));	/* upper 16 bits, range-checked there */
+	mach_write_to_4(b + 2, (ulint) n);	/* lower 32 bits */
+}
+
+/********************************************************//**
+The following function is used to fetch data from 6 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 48-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_6(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to 6 bytes */
+{
+	/* first 2 bytes and last 4 bytes; presumably (high, low) - confirm in ut0byte.h */
+	return(ut_ull_create(mach_read_from_2(b), mach_read_from_4(b + 2)));
+}
+
+/*********************************************************//**
+Writes a 64-bit integer in a compressed form (5..9 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_u64_write_compressed(
+/*======================*/
+	byte*		b,	/*!< in: pointer to memory where to store */
+	ib_uint64_t	n)	/*!< in: 64-bit integer to be stored */
+{
+	/* The high 32 bits are compressed (1..5 bytes); the low 32 bits
+	follow uncompressed in 4 bytes. */
+	ulint	size = mach_write_compressed(b, (ulint) (n >> 32));
+	mach_write_to_4(b + size, (ulint) n);
+
+	return(size + 4);
+}
+
+/** Read a 64-bit integer in a compressed form (5..9 bytes).
+@param[in,out]	b	pointer to memory where to read;
+advanced by the number of bytes consumed
+@return unsigned value */
+UNIV_INLINE
+ib_uint64_t
+mach_u64_read_next_compressed(
+	const byte**	b)
+{
+	ib_uint64_t	val;
+
+	val = mach_read_next_compressed(b);	/* compressed high 32 bits */
+	val <<= 32;
+	val |= mach_read_from_4(*b);	/* uncompressed low 32 bits */
+	*b += 4;
+	return(val);
+}
+
+/*********************************************************//**
+Writes a 64-bit integer in a compressed form (1..11 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_u64_write_much_compressed(
+/*===========================*/
+	byte*		b,	/*!< in: pointer to memory where to store */
+	ib_uint64_t	n)	/*!< in: 64-bit integer to be stored */
+{
+	ulint	size;
+
+	if (!(n >> 32)) {	/* fits in 32 bits: plain compressed form */
+		return(mach_write_compressed(b, (ulint) n));
+	}
+
+	*b = (byte)0xFF;	/* marker: two compressed 32-bit halves follow */
+	size = 1 + mach_write_compressed(b + 1, (ulint) (n >> 32));
+
+	size += mach_write_compressed(b + size, (ulint) n & 0xFFFFFFFF);
+
+	return(size);
+}
+
+/*********************************************************//**
+Reads a 64-bit integer in the form written by mach_u64_write_much_compressed().
+@return the value read */
+UNIV_INLINE
+ib_uint64_t
+mach_u64_read_much_compressed(
+/*==========================*/
+	const byte*	b)	/*!< in: pointer to memory from where to read */
+{
+	ib_uint64_t	n;
+
+	if (*b != 0xFF) {	/* no marker: a plain compressed 32-bit value */
+		return(mach_read_compressed(b));
+	}
+
+	b++;	/* skip the 0xFF marker */
+	n = mach_read_next_compressed(&b);	/* high 32 bits */
+	n <<= 32;
+	n |= mach_read_compressed(b);	/* low 32 bits */
+
+	return(n);
+}
+
+/** Read a 64-bit integer in the "much compressed" form (1..11 bytes).
+@param[in,out]	b	pointer to memory where to read;
+advanced by the number of bytes consumed
+@return unsigned value */
+UNIV_INLINE
+ib_uint64_t
+mach_read_next_much_compressed(
+	const byte**	b)
+{
+	ib_uint64_t	val = mach_read_from_1(*b);
+
+	if (val < 0x80) {
+		/* 0nnnnnnn (7 bits) */
+		++*b;
+	} else if (val < 0xC0) {
+		/* 10nnnnnn nnnnnnnn (14 bits) */
+		val = mach_read_from_2(*b) & 0x3FFF;
+		ut_ad(val > 0x7F);
+		*b += 2;
+	} else if (val < 0xE0) {
+		/* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */
+		val = mach_read_from_3(*b) & 0x1FFFFF;
+		ut_ad(val > 0x3FFF);
+		*b += 3;
+	} else if (val < 0xF0) {
+		/* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */
+		val = mach_read_from_4(*b) & 0xFFFFFFF;
+		ut_ad(val > 0x1FFFFF);
+		*b += 4;
+	} else if (val == 0xF0) {
+		/* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */
+		val = mach_read_from_4(*b + 1);
+		ut_ad(val > 0xFFFFFFF);
+		*b += 5;
+	} else {
+		/* 11111111 followed by up to 64 bits */
+		ut_ad(val == 0xFF);
+		++*b;	/* skip the 0xFF marker */
+		val = mach_read_next_compressed(b);	/* high 32 bits */
+		ut_ad(val > 0);	/* the writer uses 0xFF only for a nonzero high half */
+		val <<= 32;
+		val |= mach_read_next_compressed(b);	/* low 32 bits */
+	}
+
+	return(val);
+}
+
+/*********************************************************//**
+Reads a double. It is stored in a little-endian format.
+@return double read */
+UNIV_INLINE
+double
+mach_double_read(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to memory from where to read */
+{
+	double	d;
+	ulint	i;
+	byte*	ptr;
+
+	ptr = (byte*) &d;	/* fill d one byte at a time */
+
+	for (i = 0; i < sizeof(double); i++) {
+#ifdef WORDS_BIGENDIAN
+		ptr[sizeof(double) - i - 1] = b[i];	/* reverse the byte order */
+#else
+		ptr[i] = b[i];	/* same byte order: plain copy */
+#endif
+	}
+
+	return(d);
+}
+
+/*********************************************************//**
+Writes a double. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_double_write(
+/*==============*/
+	byte*	b,	/*!< in: pointer to memory where to write */
+	double	d)	/*!< in: double */
+{
+	ulint	i;
+	byte*	ptr;
+
+	ptr = (byte*) &d;	/* read d one byte at a time */
+
+	for (i = 0; i < sizeof(double); i++) {
+#ifdef WORDS_BIGENDIAN
+		b[i] = ptr[sizeof(double) - i - 1];	/* reverse the byte order */
+#else
+		b[i] = ptr[i];	/* same byte order: plain copy */
+#endif
+	}
+}
+
+/*********************************************************//**
+Reads a float. It is stored in a little-endian format.
+@return float read */
+UNIV_INLINE
+float
+mach_float_read(
+/*============*/
+	const byte*	b)	/*!< in: pointer to memory from where to read */
+{
+	float	d;
+	ulint	i;
+	byte*	ptr;
+
+	ptr = (byte*) &d;	/* fill d one byte at a time */
+
+	for (i = 0; i < sizeof(float); i++) {
+#ifdef WORDS_BIGENDIAN
+		ptr[sizeof(float) - i - 1] = b[i];	/* reverse the byte order */
+#else
+		ptr[i] = b[i];	/* same byte order: plain copy */
+#endif
+	}
+
+	return(d);
+}
+
+/*********************************************************//**
+Writes a float. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_float_write(
+/*=============*/
+	byte*	b,	/*!< in: pointer to memory where to write */
+	float	d)	/*!< in: float */
+{
+	ulint	i;
+	byte*	ptr;
+
+	ptr = (byte*) &d;	/* read d one byte at a time */
+
+	for (i = 0; i < sizeof(float); i++) {
+#ifdef WORDS_BIGENDIAN
+		b[i] = ptr[sizeof(float) - i - 1];	/* reverse the byte order */
+#else
+		b[i] = ptr[i];	/* same byte order: plain copy */
+#endif
+	}
+}
+
+/*********************************************************//**
+Reads a ulint stored in the little-endian format (least significant byte first).
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_n_little_endian(
+/*===========================*/
+	const byte*	buf,		/*!< in: from where to read */
+	ulint		buf_size)	/*!< in: from how many bytes to read, > 0 */
+{
+	ulint	n	= 0;
+	const byte*	ptr;
+
+	/* NOTE(review): no check here that buf_size <= sizeof(ulint) */
+	ut_ad(buf_size > 0);
+
+	ptr = buf + buf_size;	/* start one past the most significant byte */
+
+	for (;;) {
+		ptr--;
+
+		n = n << 8;
+
+		n += (ulint)(*ptr);
+
+		if (ptr == buf) {
+			break;
+		}
+	}
+
+	return(n);
+}
+
+/*********************************************************//**
+Writes a ulint in the little-endian format (least significant byte first). */
+UNIV_INLINE
+void
+mach_write_to_n_little_endian(
+/*==========================*/
+	byte*	dest,		/*!< in: where to write */
+	ulint	dest_size,	/*!< in: into how many bytes to write, > 0 */
+	ulint	n)		/*!< in: unsigned long int to write */
+{
+	byte*	end;
+
+	ut_ad(dest_size <= sizeof(ulint));
+	ut_ad(dest_size > 0);
+
+	end = dest + dest_size;
+
+	for (;;) {
+		*dest = (byte)(n & 0xFF);	/* emit the current lowest byte */
+
+		n = n >> 8;
+
+		dest++;
+
+		if (dest == end) {
+			break;
+		}
+	}
+
+	ut_ad(n == 0);	/* the value must have fit in dest_size bytes */
+}
+
+/*********************************************************//**
+Reads a 2-byte ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_2_little_endian(
+/*===========================*/
+	const byte*	buf)		/*!< in: from where to read */
+{
+	return((ulint)(buf[0]) | ((ulint)(buf[1]) << 8));	/* buf[0] is the low byte */
+}
+
+/*********************************************************//**
+Writes a ulint in the little-endian format into 2 bytes. */
+UNIV_INLINE
+void
+mach_write_to_2_little_endian(
+/*==========================*/
+	byte*	dest,		/*!< in: where to write */
+	ulint	n)		/*!< in: unsigned long int to write, < 64k */
+{
+	ut_ad(n < 256 * 256);
+
+	*dest = (byte)(n & 0xFFUL);	/* low byte first */
+
+	n = n >> 8;
+	dest++;
+
+	*dest = (byte)(n & 0xFFUL);	/* then the high byte */
+}
+
+/*********************************************************//**
+Convert integral type from storage byte order (big endian) to
+host byte order. For signed types, bit 0x80 of src[0] is inverted on read.
+@return integer value */
+UNIV_INLINE
+ib_uint64_t
+mach_read_int_type(
+/*===============*/
+	const byte*	src,		/*!< in: where to read from */
+	ulint		len,		/*!< in: length of src */
+	ibool		unsigned_type)	/*!< in: nonzero if the type is unsigned */
+{
+	/* XXX this can be optimized on big-endian machines */
+
+	uintmax_t	ret;
+	uint		i;
+
+	if (unsigned_type || (src[0] & 0x80)) {
+		/* unsigned, or signed with the stored top bit set: start from 0 */
+		ret = 0x0000000000000000ULL;
+	} else {
+		/* signed with the stored top bit clear: start with high bits set */
+		ret = 0xFFFFFFFFFFFFFF00ULL;
+	}
+
+	if (unsigned_type) {
+
+		ret |= src[0];
+	} else {
+		/* flip bit 0x80 of the first byte */
+		ret |= src[0] ^ 0x80;
+	}
+
+	for (i = 1; i < len; i++) {
+		ret <<= 8;
+		ret |= src[i];
+	}
+
+	return(ret);
+}
+/*********************************************************//**
+Swap byte ordering: copy len bytes from 'from' to 'dest' in reverse order. */
+UNIV_INLINE
+void
+mach_swap_byte_order(
+/*=================*/
+        byte*           dest,           /*!< out: where to write */
+        const byte*     from,           /*!< in: where to read from */
+        ulint           len)            /*!< in: number of bytes, 1..8 */
+{
+        ut_ad(len > 0);
+        ut_ad(len <= 8);
+
+        dest += len; /* write backwards from the end of dest */
+
+        switch (len & 0x7) { /* len == 8 maps to case 0 */
+        case 0: *--dest = *from++; /* fall through */
+        case 7: *--dest = *from++; /* fall through */
+        case 6: *--dest = *from++; /* fall through */
+        case 5: *--dest = *from++; /* fall through */
+        case 4: *--dest = *from++; /* fall through */
+        case 3: *--dest = *from++; /* fall through */
+        case 2: *--dest = *from++; /* fall through */
+        case 1: *--dest = *from;
+        }
+}
+
+/*************************************************************
+Convert a ulonglong integer from host byte order to (big-endian)
+storage byte order. For signed types, bit 0x80 of dest[0] is inverted. */
+UNIV_INLINE
+void
+mach_write_ulonglong(
+/*=================*/
+	byte*		dest,		/*!< in: where to write */
+	ulonglong	src,		/*!< in: value to write */
+	ulint		len,		/*!< in: length of dest */
+	bool		usign)		/*!< in: true if unsigned, false if signed */
+{
+	byte*		ptr = reinterpret_cast<byte*>(&src);
+
+	ut_ad(len <= sizeof(ulonglong));
+
+#ifdef WORDS_BIGENDIAN
+	memcpy(dest, ptr + (sizeof(src) - len), len);	/* last len bytes of src */
+#else
+	mach_swap_byte_order(dest, reinterpret_cast<byte*>(ptr), len);
+#endif /* WORDS_BIGENDIAN */
+
+	if (!usign) {
+		*dest ^=  0x80;	/* flip bit 0x80 of the first stored byte */
+	}
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/mariadb_stats.h b/storage/innobase/include/mariadb_stats.h
new file mode 100644
index 00000000..e9051c0c
--- /dev/null
+++ b/storage/innobase/include/mariadb_stats.h
@@ -0,0 +1,119 @@
+/*****************************************************************************
+
+Copyright (c) 2023, MariaDB Foundation
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef mariadb_stats_h
+#define mariadb_stats_h
+
+/* Include file to handle mariadbd handler specific stats */
+
+#include "ha_handler_stats.h"
+#include "my_rdtsc.h"
+
+/* Threads that are not active point to this structure */
+extern thread_local ha_handler_stats mariadb_dummy_stats;
+
+/* Points to either THD->handler_stats or mariadb_dummy_stats */
+extern thread_local ha_handler_stats *mariadb_stats;
+
+/* Returns true if MariaDB wants engine statistics for the current thread,
+   i.e. the structure mariadb_stats points at has a nonzero 'active' field */
+
+inline bool mariadb_stats_active()
+{
+  const ha_handler_stats *current= mariadb_stats;
+  return current->active != 0;
+}
+
+inline bool mariadb_stats_active(ha_handler_stats *stats)
+{
+  return !(stats->active == 0);  /* same test on a caller-supplied pointer */
+}
+
+/* The following functions increment different engine status */
+
+inline void mariadb_increment_pages_accessed()
+{
+  ++mariadb_stats->pages_accessed;  /* via this thread's mariadb_stats */
+}
+
+inline void mariadb_increment_pages_updated(ulonglong count)
+{
+  mariadb_stats->pages_updated= mariadb_stats->pages_updated + count;
+}
+
+inline void mariadb_increment_pages_read()
+{
+  ++mariadb_stats->pages_read_count;  /* via this thread's mariadb_stats */
+}
+
+inline void mariadb_increment_undo_records_read()
+{
+  ++mariadb_stats->undo_records_read;  /* via this thread's mariadb_stats */
+}
+
+/*
+  Returns a timestamp: my_timer_cycles() when MY_TIMER_ROUTINE_CYCLES is
+  nonzero, else my_timer_microseconds(). The code has to stay identical to
+  measure() in sql_analyze_stmt.h; call only if mariadb_stats_active() is true.
+*/
+
+inline ulonglong mariadb_measure()
+{
+#if (MY_TIMER_ROUTINE_CYCLES)
+    return my_timer_cycles();
+#else
+    return my_timer_microseconds();
+#endif
+}
+
+/*
+  Adds the time elapsed since start_time to pages_read_time.
+  Call this only if start_time != 0 and statistics are active.
+  See buf0rea.cc for an example of how to use it efficiently
+*/
+inline void mariadb_increment_pages_read_time(ulonglong start_time)
+{
+  ha_handler_stats *stats= mariadb_stats;
+  const ulonglong now= mariadb_measure();
+
+  /* Check that we only call this if active, see example! */
+  DBUG_ASSERT(start_time);
+  DBUG_ASSERT(mariadb_stats_active(stats));
+  stats->pages_read_time+= now - start_time;
+}
+
+
+/* Helper class to set mariadb_stats temporarily for one call in handler.cc;
+   destruction points mariadb_stats back at mariadb_dummy_stats. */
+class mariadb_set_stats
+{
+public:
+  uint flag;  /* not read or written by this class */
+
+  /* A null stats pointer falls back to this thread's dummy structure */
+  mariadb_set_stats(ha_handler_stats *stats)
+  {
+    if (!stats)
+      stats= &mariadb_dummy_stats;
+    mariadb_stats= stats;
+  }
+
+  ~mariadb_set_stats() { mariadb_stats= &mariadb_dummy_stats; }
+};
+
+#endif /* mariadb_stats_h */
diff --git a/storage/innobase/include/mem0mem.h b/storage/innobase/include/mem0mem.h
new file mode 100644
index 00000000..959147a6
--- /dev/null
+++ b/storage/innobase/include/mem0mem.h
@@ -0,0 +1,345 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mem0mem.h
+The memory management
+
+Created 6/9/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef mem0mem_h
+#define mem0mem_h
+
+#include "ut0mem.h"
+#include "ut0rnd.h"
+#include "mach0data.h"
+
+#include <memory>
+
+/* -------------------- MEMORY HEAPS ----------------------------- */
+
+/** A block of a memory heap consists of the mem_block_info_t header
+followed by the area of memory that allocations are served from */
+typedef struct mem_block_info_t	mem_block_t;
+
+/** A memory heap is a nonempty list of blocks, referred to by its first block */
+typedef mem_block_t		mem_heap_t;
+
+/** Heap types, tested as bit flags in the 'type' field of a block. DYNAMIC
+means allocation from the dynamic memory pool of the C compiler; BUFFER means
+allocation from the buffer pool, which is used for very big heaps */
+
+#define MEM_HEAP_DYNAMIC	0	/* the most common type */
+#define MEM_HEAP_BUFFER		1	/* heap may grow into the buffer pool */
+#define MEM_HEAP_BTR_SEARCH	2	/* this flag can optionally be
+					ORed to MEM_HEAP_BUFFER, in which
+					case heap->free_block is used in
+					some cases for memory allocations,
+					and if it's NULL, the memory
+					allocation functions can return
+					NULL. */
+
+/** Heap types named after the data structure that uses them */
+#define MEM_HEAP_FOR_BTR_SEARCH		(MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER)
+#define MEM_HEAP_FOR_LOCK_HEAP		(MEM_HEAP_BUFFER)
+
+/** The following start size is used for the first block in the memory heap if
+the size is not specified, i.e., 0 is given as the size in a heap creation
+call. The standard size is the maximum (payload) size of the blocks used for
+allocations of small buffers. */
+
+#define MEM_BLOCK_START_SIZE		64
+#define MEM_BLOCK_STANDARD_SIZE		\
+	(srv_page_size >= 16384 ? 8000 : MEM_MAX_ALLOC_IN_BUF)
+
+/** If a memory heap is allowed to grow into the buffer pool, the following
+is the maximum size for a single allocated buffer, REDZONE_SIZE included: */
+#define MEM_MAX_ALLOC_IN_BUF		(srv_page_size - 200 + REDZONE_SIZE)
+
+/** Space needed when allocating for a user a field of length N.
+The space is allocated only in multiples of UNIV_MEM_ALIGNMENT.  */
+#define MEM_SPACE_NEEDED(N) UT_CALC_ALIGN((N), UNIV_MEM_ALIGNMENT)
+
+#ifdef UNIV_DEBUG
+/** Create a MEM_HEAP_DYNAMIC heap, passing on the caller's file and line.
+@param[in]	size		Desired start block size; 0 means default. */
+# define mem_heap_create(size)					\
+	 mem_heap_create_func((size), __FILE__, __LINE__, MEM_HEAP_DYNAMIC)
+
+/** Create a heap of the given type, passing on the caller's file and line.
+@param[in]	size		Desired start block size; 0 means default.
+@param[in]	type		Heap type */
+# define mem_heap_create_typed(size, type)			\
+	 mem_heap_create_func((size), __FILE__, __LINE__, (type))
+
+#else /* UNIV_DEBUG */
+/** Create a MEM_HEAP_DYNAMIC heap.
+@param[in]	size		Desired start block size; 0 means default. */
+# define mem_heap_create(size) mem_heap_create_func((size), MEM_HEAP_DYNAMIC)
+
+/** Create a heap of the given type.
+@param[in]	size		Desired start block size; 0 means default.
+@param[in]	type		Heap type */
+# define mem_heap_create_typed(size, type)			\
+	 mem_heap_create_func((size), (type))
+
+#endif /* UNIV_DEBUG */
+
+/** Creates a memory heap.
+NOTE: Use the mem_heap_create() or mem_heap_create_typed() macro instead.
+A single user buffer of 'size' will fit in the block.
+A size of 0 creates a block of MEM_BLOCK_START_SIZE.
+@param[in]	size		Desired start block size.
+@param[in]	file_name	File name where created (UNIV_DEBUG only)
+@param[in]	line		Line where created (UNIV_DEBUG only)
+@param[in]	type		Heap type
+@return own: memory heap, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+mem_heap_t*
+mem_heap_create_func(
+	ulint		size,
+#ifdef UNIV_DEBUG
+	const char*	file_name,
+	unsigned	line,
+#endif /* UNIV_DEBUG */
+	ulint		type);
+
+/** Frees the space occupied by a memory heap: all of its blocks.
+NOTE(review): no wrapper macro for this function is defined in this header.
+@param[in]	heap	Heap to be freed */
+UNIV_INLINE
+void
+mem_heap_free(
+	mem_heap_t*	heap);
+
+/** Allocates and zero-fills n bytes of memory from a memory heap.
+@param[in]	heap	memory heap; must not be of MEM_HEAP_BTR_SEARCH type
+@param[in]	n	number of bytes; if the heap is allowed to grow into
+the buffer pool, n + REDZONE_SIZE must be <= MEM_MAX_ALLOC_IN_BUF
+@return allocated, zero-filled storage */
+UNIV_INLINE
+void*
+mem_heap_zalloc(
+	mem_heap_t*	heap,
+	ulint		n);
+
+/** Allocates n bytes of memory from a memory heap.
+@param[in]	heap	memory heap
+@param[in]	n	number of bytes; if the heap is allowed to grow into
+the buffer pool, n + REDZONE_SIZE must be <= MEM_MAX_ALLOC_IN_BUF
+@return allocated storage, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+void*
+mem_heap_alloc(
+	mem_heap_t*	heap,
+	ulint		n);
+
+/** Returns a pointer to the heap top: the first free byte of the last block.
+@param[in]	heap		memory heap
+@return pointer to the heap top */
+UNIV_INLINE
+byte*
+mem_heap_get_heap_top(
+	mem_heap_t*	heap);
+
+/** Frees the space in a memory heap exceeding the pointer given.
+The pointer must have been acquired from mem_heap_get_heap_top.
+The first memory block of the heap is not freed.
+@param[in]	heap		heap from which to free
+@param[in]	old_top		pointer to old top of heap */
+UNIV_INLINE
+void
+mem_heap_free_heap_top(
+	mem_heap_t*	heap,
+	byte*		old_top);
+
+/** Empties a memory heap and releases its free_block, if any.
+The first memory block of the heap is not freed.
+@param[in]	heap		heap to empty */
+UNIV_INLINE
+void
+mem_heap_empty(
+	mem_heap_t*	heap);
+
+/** Returns a pointer to the topmost element in a memory heap.
+The size of the element must be given.
+@param[in]	heap	memory heap
+@param[in]	n	size of the topmost element
+@return pointer to the topmost element */
+UNIV_INLINE
+void*
+mem_heap_get_top(
+	mem_heap_t*	heap,
+	ulint		n);
+
+/*****************************************************************//**
+Frees the topmost element in a memory heap. The size of the element
+must be given without REDZONE_SIZE, which the function adds itself. */
+UNIV_INLINE
+void
+mem_heap_free_top(
+/*==============*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	ulint		n);	/*!< in: size of the topmost element */
+/*****************************************************************//**
+Returns the space in bytes occupied by a memory heap, free_block included. */
+UNIV_INLINE
+ulint
+mem_heap_get_size(
+/*==============*/
+	mem_heap_t*	heap);		/*!< in: heap */
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string into memory from ut_malloc_nokey().
+@return own: a copy of the string, must be deallocated with ut_free */
+UNIV_INLINE
+char*
+mem_strdup(
+/*=======*/
+	const char*	str);	/*!< in: string to be copied */
+/**********************************************************************//**
+Makes a NUL-terminated copy of the first len bytes of a string.
+@return own: a copy of the string, must be deallocated with ut_free */
+UNIV_INLINE
+char*
+mem_strdupl(
+/*========*/
+	const char*	str,	/*!< in: string to be copied */
+	ulint		len);	/*!< in: length of str, in bytes */
+
+/** Copy len bytes of data into storage allocated from a memory heap.
+@param[in]	heap	memory heap where the copy is allocated
+@param[in]	data	block of data to be copied
+@param[in]	len	length of data, in bytes
+@return own: a copy of data, or NULL if data is NULL */
+inline void*
+mem_heap_dup(mem_heap_t* heap, const void* data, size_t len)
+{
+	ut_ad(data || !len);
+	if (UNIV_LIKELY(data != NULL)) {
+		return memcpy(mem_heap_alloc(heap, len), data, len);
+	}
+	return NULL;
+}
+
+/** Copy a NUL-terminated string, terminator included, into heap storage.
+@param[in]	heap	memory heap where the copy is allocated
+@param[in]	str	string to be copied; strlen() is applied to it
+@return own: a copy of the string */
+inline char*
+mem_heap_strdup(mem_heap_t* heap, const char* str)
+{
+	const size_t	size = strlen(str) + 1;
+	return static_cast<char*>(mem_heap_dup(heap, str, size));
+}
+
+/** Copy the first len bytes of a string into heap storage and terminate it.
+@param[in]	heap	memory heap where the copy is allocated
+@param[in]	str	string to be copied
+@param[in]	len	length of str, in bytes
+@return own: a NUL-terminated copy of str */
+inline char*
+mem_heap_strdupl(mem_heap_t* heap, const char* str, size_t len)
+{
+	char*	copy = static_cast<char*>(mem_heap_alloc(heap, len + 1));
+	memcpy(copy, str, len);
+	copy[len] = '\0';
+	return copy;
+}
+
+/**********************************************************************//**
+Concatenate two strings and return the result, using a memory heap.
+@return own: the concatenation of s1 and s2 */
+char*
+mem_heap_strcat(
+/*============*/
+	mem_heap_t*	heap,	/*!< in: memory heap where string is allocated */
+	const char*	s1,	/*!< in: string 1 */
+	const char*	s2);	/*!< in: string 2 */
+
+/****************************************************************//**
+A simple sprintf replacement that dynamically allocates the space for the
+formatted string from the given heap. This supports a very limited set of
+the printf syntax: types 's' and 'u' and length modifier 'l' (which is
+required for the 'u' type), i.e. "%s" and "%lu".
+@return heap-allocated formatted string */
+char*
+mem_heap_printf(
+/*============*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	const char*	format,	/*!< in: format string */
+	...) MY_ATTRIBUTE ((format (printf, 2, 3)));
+
+#ifdef UNIV_DEBUG
+/** Validates the contents of a memory heap (declared in UNIV_DEBUG builds only).
+Asserts that the memory heap is consistent
+@param[in]	heap	Memory heap to validate */
+void
+mem_heap_validate(
+	const mem_heap_t*	heap);
+
+#endif /* UNIV_DEBUG */
+
+/*#######################################################################*/
+
+/** The info structure stored at the beginning of a heap block */
+struct mem_block_info_t {
+#ifdef UNIV_DEBUG
+	char	file_name[8];/* file name where the mem heap was created */
+	unsigned line;	/*!< line number where the mem heap was created */
+#endif /* UNIV_DEBUG */
+	UT_LIST_BASE_NODE_T(mem_block_t) base; /* In the first block of
+			the list this is the base node of the list of blocks;
+			in subsequent blocks this is undefined */
+	UT_LIST_NODE_T(mem_block_t) list; /* This contains pointers to next
+			and prev in the list. The first block allocated
+			to the heap is also the first block in this list,
+			though it also contains the base node of the list. */
+	ulint	len;	/*!< physical length of this block in bytes */
+	ulint	total_size; /*!< physical length in bytes of all blocks
+			in the heap. This is defined only in the base
+			node and is set to ULINT_UNDEFINED in others. */
+	ulint	type;	/*!< type of heap: MEM_HEAP_DYNAMIC, or
+			MEM_HEAP_BUFFER possibly ORed to MEM_HEAP_BTR_SEARCH */
+	ulint	free;	/*!< offset in bytes, from the start of this struct,
+			of the first free position for user data */
+	ulint	start;	/*!< the value of the struct field 'free' at the
+			creation of the block */
+
+	void*	free_block;
+			/* if the MEM_HEAP_BTR_SEARCH bit is set in type,
+			and this is the heap root, this can contain an
+			allocated buffer frame, which can be appended as a
+			free block to the heap, if we need more space;
+			otherwise, this is NULL */
+	void*	buf_block;
+			/* if this block has been allocated from the buffer
+			pool, this contains the buf_block_t handle;
+			otherwise, this is NULL */
+};
+
+/* Header size for a memory heap block: the aligned size of mem_block_info_t */
+#define MEM_BLOCK_HEADER_SIZE	UT_CALC_ALIGN(sizeof(mem_block_info_t),\
+					      UNIV_MEM_ALIGNMENT)
+
+#include "mem0mem.inl"
+#endif
diff --git a/storage/innobase/include/mem0mem.inl b/storage/innobase/include/mem0mem.inl
new file mode 100644
index 00000000..9906daf3
--- /dev/null
+++ b/storage/innobase/include/mem0mem.inl
@@ -0,0 +1,468 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/mem0mem.inl
+The memory management
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0new.h"
+
+#ifdef UNIV_DEBUG /* forward file_name and line; the function takes type last */
+# define mem_heap_create_block(heap, n, type, file_name, line)		\
+	mem_heap_create_block_func(heap, n, file_name, line, type)
+# define mem_heap_create_at(N, file_name, line)				\
+	mem_heap_create_func(N, file_name, line, MEM_HEAP_DYNAMIC)
+#else /* UNIV_DEBUG: file_name and line are dropped from the call */
+# define mem_heap_create_block(heap, n, type, file_name, line)		\
+	mem_heap_create_block_func(heap, n, type)
+# define mem_heap_create_at(N, file_name, line)				\
+	mem_heap_create_func(N, MEM_HEAP_DYNAMIC)
+#endif /* UNIV_DEBUG */
+/***************************************************************//**
+Creates a memory heap block where data can be allocated.
+@return own: memory heap block, NULL if did not succeed (only possible
+for MEM_HEAP_BTR_SEARCH type heaps) */
+mem_block_t*
+mem_heap_create_block_func(
+/*=======================*/
+	mem_heap_t*	heap,	/*!< in: memory heap or NULL if first block
+				should be created */
+	ulint		n,	/*!< in: number of bytes needed for user data */
+#ifdef UNIV_DEBUG
+	const char*	file_name,/*!< in: file name where created */
+	unsigned	line,	/*!< in: line where created */
+#endif /* UNIV_DEBUG */
+	ulint		type);	/*!< in: type of heap: MEM_HEAP_DYNAMIC, or
+				MEM_HEAP_BUFFER with optional BTR_SEARCH */
+
+/******************************************************************//**
+Frees a block from a memory heap. */
+void
+mem_heap_block_free(
+/*================*/
+	mem_heap_t*	heap,	/*!< in: heap */
+	mem_block_t*	block);	/*!< in: block to free */
+
+/******************************************************************//**
+Frees the buffer referenced by the free_block field of a memory heap. */
+void
+mem_heap_free_block_free(
+/*=====================*/
+	mem_heap_t*	heap);	/*!< in: heap */
+
+/***************************************************************//**
+Adds a new block to a memory heap.
+@param[in]	heap	memory heap
+@param[in]	n	number of bytes needed
+@return created block, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+mem_block_t*
+mem_heap_add_block(
+	mem_heap_t*	heap,
+	ulint		n);
+
+/** Sets the physical length of a block, in bytes; it must be nonzero. */
+UNIV_INLINE void
+mem_block_set_len(mem_block_t* block, ulint len)
+{
+	ut_ad(len > 0);
+
+	block->len = len;
+}
+
+/** @return the physical length of the block, in bytes */
+UNIV_INLINE ulint
+mem_block_get_len(mem_block_t* block)
+{
+	return block->len;
+}
+
+/** Sets the heap type of a block to one of the three accepted values. */
+UNIV_INLINE void
+mem_block_set_type(mem_block_t* block, ulint type)
+{
+	ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER)
+	      || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH));
+
+	block->type = type;
+}
+
+/** @return the heap type stored in the block */
+UNIV_INLINE ulint
+mem_block_get_type(mem_block_t* block)
+{
+	return block->type;
+}
+
+/** Sets the offset of the first free byte; it must lie within (0, len]. */
+UNIV_INLINE void
+mem_block_set_free(mem_block_t* block, ulint free)
+{
+	ut_ad(free > 0);
+	ut_ad(free <= mem_block_get_len(block));
+
+	block->free = free;
+}
+
+/** @return the offset in bytes of the first free position in the block */
+UNIV_INLINE ulint
+mem_block_get_free(mem_block_t* block)
+{
+	return block->free;
+}
+
+/** Sets the offset at which user data starts in a block; must be nonzero. */
+UNIV_INLINE void
+mem_block_set_start(mem_block_t* block, ulint start)
+{
+	ut_ad(start > 0);
+
+	block->start = start;
+}
+
+/** @return the value that 'free' had when the block was created */
+UNIV_INLINE ulint
+mem_block_get_start(mem_block_t* block)
+{
+	return block->start;
+}
+
+/** Allocates n bytes from a memory heap and fills them with zeros.
+@param[in]	heap	memory heap; must not be of MEM_HEAP_BTR_SEARCH type
+@param[in]	n	number of bytes; if the heap is allowed to grow into
+the buffer pool, n + REDZONE_SIZE must be <= MEM_MAX_ALLOC_IN_BUF
+@return allocated, zero-filled storage */
+UNIV_INLINE void*
+mem_heap_zalloc(
+	mem_heap_t*	heap,
+	ulint		n)
+{
+	ut_ad(heap);
+	ut_ad(!(heap->type & MEM_HEAP_BTR_SEARCH));
+	void*	buf = mem_heap_alloc(heap, n);
+	return memset(buf, 0, n);
+}
+
+/** Allocates n bytes of memory from a memory heap.
+The request is served from the last block of the heap; a new block is added
+when the last one has too little room. REDZONE_SIZE extra bytes are reserved
+in front of the returned buffer.
+@param[in]	heap	memory heap
+@param[in]	n	number of bytes; if the heap is allowed to grow into
+the buffer pool, n + REDZONE_SIZE must be <= MEM_MAX_ALLOC_IN_BUF
+@return allocated storage, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+void*
+mem_heap_alloc(
+	mem_heap_t*	heap,
+	ulint		n)
+{
+	mem_block_t*	block = UT_LIST_GET_LAST(heap->base);
+
+	/* From here on n is the user size plus the red zone */
+	n += REDZONE_SIZE;
+
+	ut_ad(!(block->type & MEM_HEAP_BUFFER) || (n <= MEM_MAX_ALLOC_IN_BUF));
+
+	const ulint	needed = MEM_SPACE_NEEDED(n);
+
+	/* Add a new block to the heap if the last block does not have
+	enough room left for the aligned request */
+
+	if (mem_block_get_len(block) < mem_block_get_free(block) + needed) {
+
+		block = mem_heap_add_block(heap, n);
+
+		if (block == NULL) {
+			return(NULL);
+		}
+	}
+
+	/* Claim the space at the current free offset of the block */
+	const ulint	offset = mem_block_get_free(block);
+
+	mem_block_set_free(block, offset + needed);
+
+	/* The user data starts right after the red zone */
+	byte*	buf = reinterpret_cast<byte*>(block) + offset + REDZONE_SIZE;
+	MEM_MAKE_ADDRESSABLE(buf, n - REDZONE_SIZE);
+	return(buf);
+}
+
+/** Returns a pointer to the heap top: the first free byte of the last
+block in the heap's block list. The value can later be handed to
+mem_heap_free_heap_top() to release what was allocated after this call.
+@param[in]	heap	memory heap
+@return pointer to the heap top */
+UNIV_INLINE
+byte*
+mem_heap_get_heap_top(
+	mem_heap_t*	heap)
+{
+	/* The heap top lives in the last block of the list */
+	mem_block_t*	last = UT_LIST_GET_LAST(heap->base);
+	const ulint	used = mem_block_get_free(last);
+
+	/* 'free' is a byte offset measured from the block header */
+	return reinterpret_cast<byte*>(last) + used;
+}
+
+/** Frees the space in a memory heap exceeding the pointer given: blocks are
+freed from the last one backwards until the block holding old_top is reached.
+old_top must come from mem_heap_get_heap_top. The first block is not freed.
+@param[in]	heap		heap from which to free
+@param[in]	old_top		pointer to old top of heap */
+UNIV_INLINE
+void
+mem_heap_free_heap_top(
+	mem_heap_t*	heap,
+	byte*		old_top)
+{
+	mem_block_t*	block;
+	mem_block_t*	prev_block;
+
+	ut_d(mem_heap_validate(heap));
+
+	block = UT_LIST_GET_LAST(heap->base);
+
+	while (block != NULL) {
+		if (((byte*) block + mem_block_get_free(block) >= old_top)
+		    && ((byte*) block <= old_top)) {
+			/* old_top lies within the used part of this block */
+
+			break;
+		}
+
+		/* Store prev_block value before freeing the current block
+		(the current block will be erased in freeing) */
+
+		prev_block = UT_LIST_GET_PREV(list, block);
+
+		mem_heap_block_free(heap, block);
+
+		block = prev_block;
+	}
+
+	ut_ad(block);	/* some block of the heap must contain old_top */
+
+	/* Make old_top the new first free position of the block */
+	mem_block_set_free(block,
+			   ulint(old_top - reinterpret_cast<byte*>(block)));
+
+	ut_ad(mem_block_get_start(block) <= mem_block_get_free(block));
+	MEM_NOACCESS(old_top, (byte*) block + block->len - old_top);
+
+	/* A block left without user data (free == start) is freed as
+	well, unless it is the first block of the heap */
+
+	if ((heap != block) && (mem_block_get_free(block)
+				== mem_block_get_start(block))) {
+		mem_heap_block_free(heap, block);
+	}
+}
+
+/** Empties a memory heap: frees everything above the start offset of the
+first block, which itself is kept, and releases the free_block if present.
+@param[in]	heap	heap to empty */
+UNIV_INLINE void
+mem_heap_empty(
+	mem_heap_t*	heap)
+{
+	byte*	first_free = reinterpret_cast<byte*>(heap)
+		+ mem_block_get_start(heap);
+	mem_heap_free_heap_top(heap, first_free);
+	if (heap->free_block) {
+		mem_heap_free_block_free(heap);
+	}
+}
+
+/** Returns a pointer to the topmost element in a memory heap.
+The size of the element must be given.
+NOTE(review): unlike mem_heap_free_top(), REDZONE_SIZE is not added to n
+here; confirm that callers get the intended address when it is nonzero.
+@param[in]	heap	memory heap
+@param[in]	n	size of the topmost element
+@return pointer to the topmost element */
+UNIV_INLINE
+void*
+mem_heap_get_top(
+	mem_heap_t*	heap,
+	ulint		n)
+{
+	mem_block_t*	last = UT_LIST_GET_LAST(heap->base);
+	byte*		top = reinterpret_cast<byte*>(last)
+		+ mem_block_get_free(last);
+
+	/* Step back over the aligned size of the topmost element */
+	return top - MEM_SPACE_NEEDED(n);
+}
+
+/*****************************************************************//**
+Frees the topmost element in a memory heap. The size of the element must be
+given without REDZONE_SIZE, which is added here. */
+UNIV_INLINE
+void
+mem_heap_free_top(
+/*==============*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	ulint		n)	/*!< in: size of the topmost element */
+{
+	mem_block_t*	block = UT_LIST_GET_LAST(heap->base);
+
+	/* The allocation also reserved a red zone; release it as well */
+	n += REDZONE_SIZE;
+
+	const ulint	new_free = mem_block_get_free(block)
+		- MEM_SPACE_NEEDED(n);
+
+	mem_block_set_free(block, new_free);
+
+	/* A block that no longer holds user data is freed, unless it is
+	the first block of the heap; otherwise the released bytes are
+	handed to MEM_NOACCESS() */
+
+	if (heap != block && new_free == mem_block_get_start(block)) {
+		mem_heap_block_free(heap, block);
+	} else {
+		MEM_NOACCESS(reinterpret_cast<byte*>(block) + new_free, n);
+	}
+}
+
+/** Creates a memory heap.
+NOTE: Use the mem_heap_create() or mem_heap_create_typed() macro instead.
+A single user buffer of 'size' will fit in the block.
+A size of 0 creates a block of MEM_BLOCK_START_SIZE.
+@param[in]	size		Desired start block size.
+@param[in]	file_name	File name where created (UNIV_DEBUG only)
+@param[in]	line		Line where created (UNIV_DEBUG only)
+@param[in]	type		Heap type
+@return own: memory heap, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+mem_heap_t*
+mem_heap_create_func(
+	ulint		size,
+#ifdef UNIV_DEBUG
+	const char*	file_name,
+	unsigned	line,
+#endif /* UNIV_DEBUG */
+	ulint		type)
+{
+	/* A size of 0 selects the default start size */
+	const ulint	start_size = size ? size : MEM_BLOCK_START_SIZE;
+
+	mem_block_t*	block = mem_heap_create_block(NULL, start_size, type,
+						      file_name, line);
+
+	if (block == NULL) {
+		/* The first block could not be created */
+		return(NULL);
+	}
+
+	/* The first block should not be in buffer pool,
+	because it might be relocated to resize buffer pool. */
+	ut_ad(block->buf_block == NULL);
+
+	/* The first block carries the base node of the block list and is
+	also the first element of that list */
+	UT_LIST_INIT(block->base, &mem_block_t::list);
+
+	UT_LIST_ADD_FIRST(block->base, block);
+
+	/* The heap handle is the first block itself */
+	return(block);
+}
+
+/** Frees the space occupied by a memory heap: the spare free_block, if one is
+attached, and then every block of the heap.
+NOTE(review): older text said to use "the corresponding macro"; no such macro
+is visible next to this function.
+@param[in]	heap	Heap to be freed */
+UNIV_INLINE
+void
+mem_heap_free(
+	mem_heap_t*	heap)
+{
+	/* Look up the last block before anything is released */
+	mem_block_t*	block = UT_LIST_GET_LAST(heap->base);
+
+	if (heap->free_block) {
+		mem_heap_free_block_free(heap);
+	}
+
+	/* Release the blocks from the last one back to the first. The
+	predecessor is fetched first, because freeing a block erases
+	its contents. */
+
+	for (mem_block_t* prev_block; block != NULL; block = prev_block) {
+
+		prev_block = UT_LIST_GET_PREV(list, block);
+
+		mem_heap_block_free(heap, block);
+	}
+}
+
+/*****************************************************************//**
+Returns the space in bytes occupied by a memory heap: the total_size
+recorded in the heap, plus one page (srv_page_size) when a free_block
+is attached.
+@return size of the heap in bytes */
+UNIV_INLINE
+ulint
+mem_heap_get_size(
+/*==============*/
+	mem_heap_t*	heap)	/*!< in: heap */
+{
+	/* A buffer frame held in free_block counts as one more page */
+	const ulint	spare = heap->free_block ? srv_page_size : 0;
+
+	return(heap->total_size + spare);
+}
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string into memory from ut_malloc_nokey().
+@return own: a copy of the string, must be deallocated with ut_free */
+UNIV_INLINE char*
+mem_strdup(
+/*=======*/
+	const char*	str)	/*!< in: string to be copied */
+{
+	const ulint	size = strlen(str) + 1;
+	void*		copy = ut_malloc_nokey(size);
+	return static_cast<char*>(memcpy(copy, str, size));
+}
+
+/**********************************************************************//**
+Makes a NUL-terminated copy of the first len bytes of a string.
+@return own: a copy of the string, must be deallocated with ut_free */
+UNIV_INLINE char*
+mem_strdupl(
+/*========*/
+	const char*	str,	/*!< in: string to be copied */
+	ulint		len)	/*!< in: length of str, in bytes */
+{
+	char*	copy = static_cast<char*>(ut_malloc_nokey(len + 1));
+	memcpy(copy, str, len);
+	copy[len] = '\0';
+	return copy;
+}
diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h
new file mode 100644
index 00000000..e2419309
--- /dev/null
+++ b/storage/innobase/include/mtr0log.h
@@ -0,0 +1,637 @@
+/*****************************************************************************
+
+Copyright (c) 2019, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+@file include/mtr0log.h
+Mini-transaction log record encoding and decoding
+*******************************************************/
+
+#pragma once
+#include "mtr0mtr.h"
+
+/** The smallest invalid page identifier for persistent tablespaces:
+page 0 of tablespace SRV_SPACE_ID_UPPER_BOUND */
+constexpr page_id_t end_page_id{SRV_SPACE_ID_UPPER_BOUND, 0};
+
+/* Thresholds of the variable-length integer encoding. The number of
+leading 1 bits in the first byte tells how many more bytes follow; each
+longer form encodes values relative to its own MIN_*BYTE threshold. */
+
+/** The minimum 2-byte integer (0b10xxxxxx xxxxxxxx);
+smaller values are encoded in a single byte */
+constexpr uint32_t MIN_2BYTE= 1 << 7;
+/** The minimum 3-byte integer (0b110xxxxx xxxxxxxx xxxxxxxx);
+the 2-byte form covers the 1 << 14 values below this */
+constexpr uint32_t MIN_3BYTE= MIN_2BYTE + (1 << 14);
+/** The minimum 4-byte integer (0b1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx);
+the 3-byte form covers the 1 << 21 values below this */
+constexpr uint32_t MIN_4BYTE= MIN_3BYTE + (1 << 21);
+/** Minimum 5-byte integer (0b11110000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx);
+the 4-byte form covers the 1 << 28 values below this */
+constexpr uint32_t MIN_5BYTE= MIN_4BYTE + (1 << 28);
+
+/** Error from mlog_decode_varint() and mlog_decode_len(): the all-ones
+value, which mlog_encode_varint() asserts is never passed to it */
+constexpr uint32_t MLOG_DECODE_ERROR= ~0U;
+
+/** Decode the length of a variable-length encoded integer.
+The length is 1 plus the number of leading 1 bits in the first byte.
+@param first  first byte of the encoded integer
+@return the length, in bytes */
+inline uint8_t mlog_decode_varint_length(byte first)
+{
+  uint8_t len= 1;
+  while (first & 0x80)
+  {
+    /* Count this leading 1 bit and move the next bit into place. */
+    len++;
+    first= static_cast<uint8_t>(first << 1);
+  }
+  return len;
+}
+
+/** Decode a variable-length encoded integer in a redo log record.
+@tparam byte_pointer  pointer-like type that yields bytes via * and []
+@param log    redo log record buffer, positioned at the encoded integer
+@return the decoded integer
+@retval MLOG_DECODE_ERROR on error */
+template<typename byte_pointer>
+inline uint32_t mlog_decode_varint(const byte_pointer log)
+{
+  const uint32_t first= *log;
+  if (first < MIN_2BYTE)
+    return first; /* 0xxxxxxx: the byte is the value itself */
+  if (first < 0xc0)
+  {
+    /* 10xxxxxx: 2 bytes */
+    const uint32_t payload= (first & ~0x80) << 8 | log[1];
+    return MIN_2BYTE + payload;
+  }
+  if (first < 0xe0)
+  {
+    /* 110xxxxx: 3 bytes */
+    const uint32_t payload=
+      (first & ~0xc0) << 16 | uint32_t{log[1]} << 8 | log[2];
+    return MIN_3BYTE + payload;
+  }
+  if (first < 0xf0)
+  {
+    /* 1110xxxx: 4 bytes */
+    const uint32_t payload=
+      (first & ~0xe0) << 24 | uint32_t{log[1]} << 16 |
+      uint32_t{log[2]} << 8 | log[3];
+    return MIN_4BYTE + payload;
+  }
+  if (first != 0xf0)
+    return MLOG_DECODE_ERROR; /* not a valid first byte */
+
+  /* 11110000: the value is in the 4 bytes that follow */
+  const uint32_t payload=
+    uint32_t{log[1]} << 24 | uint32_t{log[2]} << 16 |
+    uint32_t{log[3]} << 8 | log[4];
+  /* Reject anything that would overflow when MIN_5BYTE is added. */
+  if (payload > ~MIN_5BYTE)
+    return MLOG_DECODE_ERROR;
+  return MIN_5BYTE + payload;
+}
+
+/** Encode an integer in a redo log record.
+The value is written in 1 to 5 bytes, depending on which MIN_*BYTE
+range it falls in; the leading bits of the first byte identify the form.
+@param log  redo log record buffer
+@param i    the integer to encode; must be less than MLOG_DECODE_ERROR
+@return end of the encoded integer */
+inline byte *mlog_encode_varint(byte *log, size_t i)
+{
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 4 and 5 need this here */
+#endif
+  if (i < MIN_2BYTE)
+  {
+    /* 1 byte: only the final store below is needed */
+  }
+  else if (i < MIN_3BYTE)
+  {
+    /* 2 bytes: 0b10 prefix and the high bits here, low byte below */
+    i-= MIN_2BYTE;
+    static_assert(MIN_3BYTE - MIN_2BYTE == 1 << 14, "compatibility");
+    *log++= 0x80 | static_cast<byte>(i >> 8);
+  }
+  else if (i < MIN_4BYTE)
+  {
+    /* 3 bytes: 0b110 prefix here; the remaining bytes are
+    written by the shared tail starting at last2 */
+    i-= MIN_3BYTE;
+    static_assert(MIN_4BYTE - MIN_3BYTE == 1 << 21, "compatibility");
+    *log++= 0xc0 | static_cast<byte>(i >> 16);
+    goto last2;
+  }
+  else if (i < MIN_5BYTE)
+  {
+    /* 4 bytes: 0b1110 prefix here; the remaining bytes are
+    written by the shared tail starting at last3 */
+    i-= MIN_4BYTE;
+    static_assert(MIN_5BYTE - MIN_4BYTE == 1 << 28, "compatibility");
+    *log++= 0xe0 | static_cast<byte>(i >> 24);
+    goto last3;
+  }
+  else
+  {
+    /* 5 bytes: the marker byte 0xf0 followed by 4 bytes of value */
+    ut_ad(i < MLOG_DECODE_ERROR);
+    i-= MIN_5BYTE;
+    *log++= 0xf0;
+    *log++= static_cast<byte>(i >> 24);
+last3:
+    *log++= static_cast<byte>(i >> 16);
+last2:
+    *log++= static_cast<byte>(i >> 8);
+  }
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+  /* The least significant byte is written in every form. */
+  *log++= static_cast<byte>(i);
+  return log;
+}
+
+/** Determine the length of a log record.
+The low 4 bits of the first byte hold the length, unless they are all
+set, in which case 1 to 3 additional length bytes follow.
+No byte at or after end is read.
+@param log  start of log record
+@param end  end of the log record buffer
+@return the length of the record, in bytes
+@retval 0                 if the log extends past the end
+@retval MLOG_DECODE_ERROR if the record is corrupted */
+inline uint32_t mlog_decode_len(const byte *log, const byte *end)
+{
+  ut_ad(log < end);
+  uint32_t i= *log;
+  if (!i)
+    return 0; /* end of mini-transaction */
+  if (~i & 15)
+    return (i & 15) + 1; /* 1..16 bytes */
+  if (UNIV_UNLIKELY(++log == end))
+    return 0; /* end of buffer */
+  i= *log;
+  if (UNIV_LIKELY(i < MIN_2BYTE)) /* 1 additional length byte: 16..143 bytes */
+    return 16 + i;
+  if (i < 0xc0) /* 2 additional length bytes: 144..16,527 bytes */
+  {
+    /* log[1] must be inside the buffer */
+    if (UNIV_UNLIKELY(end - log < 2))
+      return 0; /* end of buffer */
+    return 16 + MIN_2BYTE + ((i & ~0xc0) << 8 | log[1]);
+  }
+  if (i < 0xe0) /* 3 additional length bytes: 16528..1065103 bytes */
+  {
+    /* Both log[1] and log[2] must be inside the buffer; checking only
+    log + 2 == end would read past the end when log + 1 == end. */
+    if (UNIV_UNLIKELY(end - log < 3))
+      return 0; /* end of buffer */
+    return 16 + MIN_3BYTE + ((i & ~0xe0) << 16 |
+                             static_cast<uint32_t>(log[1]) << 8 | log[2]);
+  }
+  /* 1,065,103 bytes per log record ought to be enough for everyone */
+  return MLOG_DECODE_ERROR;
+}
+
+/** Write 1, 2, 4, or 8 bytes to a file page.
+Unless the write type is FORCED (or logging is disabled), leading bytes
+that already match the page contents are left out of the log record, and
+nothing is written or logged if all bytes match.
+@param[in]      block   file page
+@param[in,out]  ptr     pointer in file page
+@param[in]      val     value to write
+@tparam l       number of bytes to write
+@tparam w       write request type
+@tparam V       type of val
+@return whether any log was written */
+template<unsigned l,mtr_t::write_type w,typename V>
+inline bool mtr_t::write(const buf_block_t &block, void *ptr, V val)
+{
+  ut_ad(ut_align_down(ptr, srv_page_size) == block.page.frame);
+  static_assert(l == 1 || l == 2 || l == 4 || l == 8, "wrong length");
+  /* The l-byte representation of val as it will be stored in the page */
+  byte buf[l];
+
+  switch (l) {
+  case 1:
+    ut_ad(val == static_cast<byte>(val));
+    buf[0]= static_cast<byte>(val);
+    break;
+  case 2:
+    ut_ad(val == static_cast<uint16_t>(val));
+    mach_write_to_2(buf, static_cast<uint16_t>(val));
+    break;
+  case 4:
+    ut_ad(val == static_cast<uint32_t>(val));
+    mach_write_to_4(buf, static_cast<uint32_t>(val));
+    break;
+  case 8:
+    mach_write_to_8(buf, val);
+    break;
+  }
+  byte *p= static_cast<byte*>(ptr);
+  const byte *const end= p + l;
+  if (w != FORCED && is_logged())
+  {
+    /* Skip the leading bytes that are unchanged. */
+    const byte *b= buf;
+    while (*p++ == *b++)
+    {
+      if (p == end)
+      {
+        /* Every byte already has the requested value. */
+        ut_ad(w == MAYBE_NOP);
+        return false;
+      }
+    }
+    /* Step back to the first byte that differs. */
+    p--;
+  }
+  /* Store all l bytes in the page, but log only from p onwards. */
+  ::memcpy(ptr, buf, l);
+  memcpy_low(block, static_cast<uint16_t>
+             (ut_align_offset(p, srv_page_size)), p, end - p);
+  return true;
+}
+
+/** Log the filling of a byte range of a page with a single byte value.
+The page itself is not written to by this overload.
+@param[in]      b       buffer page
+@param[in]      ofs     byte offset from b->frame
+@param[in]      len     length of the data to write
+@param[in]      val     the data byte to write */
+inline void mtr_t::memset(const buf_block_t &b, ulint ofs, ulint len, byte val)
+{
+  ut_ad(len);
+  set_modified(b);
+  if (!is_logged())
+    return;
+
+  static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+  /* Size of the variable-length encoded len, plus 1 byte for val */
+  const size_t payload= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3) + 1;
+  byte *const start= log_write<MEMSET>(b.page.id(), &b.page, payload, true,
+                                       ofs);
+  byte *const fill= mlog_encode_varint(start, len);
+  *fill= val;
+  m_log.close(fill + 1);
+  m_last_offset= static_cast<uint16_t>(ofs + len);
+}
+
+/** Fill a byte range of a page with a single byte value and log the change.
+@param[in,out]  b       buffer page
+@param[in]      ofs     byte offset from block->frame
+@param[in]      len     length of the data to write
+@param[in]      val     the data byte to write */
+inline void mtr_t::memset(const buf_block_t *b, ulint ofs, ulint len, byte val)
+{
+  ut_ad(ofs <= ulint(srv_page_size));
+  ut_ad(ofs + len <= ulint(srv_page_size));
+  /* Apply the change to the page frame first, then write the log. */
+  void *const dest= b->page.frame + ofs;
+  ::memset(dest, val, len);
+  memset(*b, ofs, len, val);
+}
+
+/** Log the filling of a byte range of a page with a repeating byte string.
+The page itself is not written to by this overload.
+@param[in]      b       buffer page
+@param[in]      ofs     byte offset from b->frame
+@param[in]      len     length of the data to write, in bytes
+@param[in]      str     the string to write
+@param[in]      size    size of str, in bytes */
+inline void mtr_t::memset(const buf_block_t &b, ulint ofs, size_t len,
+                          const void *str, size_t size)
+{
+  ut_ad(size);
+  ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */
+  set_modified(b);
+  if (!is_logged())
+    return;
+
+  static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+  /* Size of the variable-length encoded len */
+  size_t lenlen;
+  if (len < MIN_2BYTE)
+    lenlen= 1;
+  else if (len < MIN_3BYTE)
+    lenlen= 2;
+  else
+    lenlen= 3;
+  byte *const start= log_write<MEMSET>(b.page.id(), &b.page, lenlen + size,
+                                       true, ofs);
+  /* The record payload is the encoded len followed by one copy of str. */
+  byte *const pattern= mlog_encode_varint(start, len);
+  ::memcpy(pattern, str, size);
+  m_log.close(pattern + size);
+  m_last_offset= static_cast<uint16_t>(ofs + len);
+}
+
+/** Initialize a repeating string of bytes.
+Fills len bytes of the page, starting at ofs, with consecutive copies of
+str (the last copy is truncated if len is not a multiple of size), and
+then logs the change.
+@param[in,out]  b       buffer page
+@param[in]      ofs     byte offset from b->frame
+@param[in]      len     length of the data to write, in bytes
+@param[in]      str     the string to write
+@param[in]      size    size of str, in bytes */
+inline void mtr_t::memset(const buf_block_t *b, ulint ofs, size_t len,
+                          const void *str, size_t size)
+{
+  ut_ad(size);
+  ut_ad(ofs <= ulint(srv_page_size));
+  ut_ad(ofs + len <= ulint(srv_page_size));
+  ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */
+  size_t s= 0;
+  /* Write every whole copy of str that fits. Advancing by size (not by
+  len) is what makes the pattern repeat; bounding the loop by the space
+  that remains keeps the last whole copy inside the len bytes. */
+  for (; len - s >= size; s+= size)
+    ::memcpy(ofs + s + b->page.frame, str, size);
+  /* Write the truncated final copy (0 bytes if len is a multiple of size) */
+  ::memcpy(ofs + s + b->page.frame, str, len - s);
+  memset(*b, ofs, len, str, size);
+}
+
+/** Log a write of a byte string to a page.
+The logged data is read from the page frame at the given offset.
+@param[in]      b       buffer page
+@param[in]      offset  byte offset from b->frame
+@param[in]      len     length of the data to write */
+inline void mtr_t::memcpy(const buf_block_t &b, ulint offset, ulint len)
+{
+  ut_ad(len);
+  ut_ad(offset <= ulint(srv_page_size));
+  ut_ad(offset + len <= ulint(srv_page_size));
+  const void *const data= b.page.frame + offset;
+  memcpy_low(b, uint16_t(offset), data, len);
+}
+
+/** Log a write of a byte string to a page.
+The page itself is not written to.
+@param block   page
+@param offset  byte offset within page
+@param data    data to be written
+@param len     length of the data, in bytes */
+inline void mtr_t::memcpy_low(const buf_block_t &block, uint16_t offset,
+                              const void *data, size_t len)
+{
+  ut_ad(len);
+  set_modified(block);
+  if (!is_logged())
+    return;
+  /* Whether the record header and the data are requested from the
+  log buffer in a single allocation */
+  const bool fits= len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5);
+  byte *const payload= log_write<WRITE>(block.page.id(), &block.page, len,
+                                        fits, offset);
+  if (fits)
+  {
+    ::memcpy(payload, data, len);
+    m_log.close(payload + len);
+  }
+  else
+  {
+    /* Finish the header, then append the data separately. */
+    m_log.close(payload);
+    m_log.push(static_cast<const byte*>(data), static_cast<uint32_t>(len));
+  }
+  m_last_offset= static_cast<uint16_t>(offset + len);
+}
+
+/** Log that a string of bytes was copied from the same page.
+The page itself is not written to.
+@param[in]      b       buffer page
+@param[in]      d       destination offset within the page
+@param[in]      s       source offset within the page
+@param[in]      len     length of the data to copy */
+inline void mtr_t::memmove(const buf_block_t &b, ulint d, ulint s, ulint len)
+{
+  ut_ad(d >= 8);
+  ut_ad(s >= 8);
+  ut_ad(len);
+  ut_ad(s <= ulint(srv_page_size));
+  ut_ad(s + len <= ulint(srv_page_size));
+  ut_ad(s != d);
+  ut_ad(d <= ulint(srv_page_size));
+  ut_ad(d + len <= ulint(srv_page_size));
+
+  set_modified(b);
+  if (!is_logged())
+    return;
+  static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+  const size_t lenlen= len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3;
+  /* The source offset is encoded as its distance from the destination
+  offset, shifted left by one, with the least significant bit set when
+  the source does not follow the destination. */
+  ulint rel= s > d ? (s - d) << 1 : (d - s) << 1 | 1;
+  /* A distance of 0 is not possible, so the encoding starts at 1. */
+  rel-= 1 << 1;
+  const size_t slen= rel < MIN_2BYTE ? 1 : rel < MIN_3BYTE ? 2 : 3;
+  byte *l= log_write<MEMMOVE>(b.page.id(), &b.page, lenlen + slen, true, d);
+  l= mlog_encode_varint(l, len);
+  l= mlog_encode_varint(l, rel);
+  m_log.close(l);
+  m_last_offset= static_cast<uint16_t>(d + len);
+}
+
+/**
+Write a log record.
+Opens the mini-transaction log buffer and writes the record header: the
+type byte (with the record length in its low 4 bits, or followed by a
+separately encoded length when the record is too long for that), the page
+identifier unless the record refers to the same page as the previous
+one, and the byte offset for the record types that carry one. The caller
+writes the payload after the returned pointer and closes the buffer.
+@tparam type   redo log record type
+@param id     persistent page identifier
+@param bpage  buffer pool page, or nullptr
+@param len    number of additional bytes to write
+@param alloc  whether to allocate the additional bytes
+@param offset byte offset, or 0 if the record type does not allow one
+@return end of mini-transaction log, minus len */
+template<byte type>
+inline byte *mtr_t::log_write(const page_id_t id, const buf_page_t *bpage,
+                              size_t len, bool alloc, size_t offset)
+{
+  static_assert(!(type & 15) && type != RESERVED &&
+                type <= FILE_CHECKPOINT, "invalid type");
+  ut_ad(type >= FILE_CREATE || is_named_space(id.space()));
+  ut_ad(!bpage || bpage->id() == id);
+  ut_ad(id < end_page_id);
+  /* Which header fields this record type carries */
+  constexpr bool have_len= type != INIT_PAGE && type != FREE_PAGE;
+  constexpr bool have_offset= type == WRITE || type == MEMSET ||
+    type == MEMMOVE;
+  static_assert(!have_offset || have_len, "consistency");
+  ut_ad(have_len || len == 0);
+  ut_ad(have_len || !alloc);
+  ut_ad(have_offset || offset == 0);
+  ut_ad(offset + len <= srv_page_size);
+  static_assert(MIN_4BYTE >= UNIV_PAGE_SIZE_MAX, "consistency");
+  ut_ad(type == FREE_PAGE || type == OPTION || (type == EXTENDED && !bpage) ||
+        memo_contains_flagged(bpage, MTR_MEMO_MODIFY));
+  /* Upper bound of the header size. NOTE(review): the terms look like
+  1 type byte, up to 3 length bytes, up to 5 + 5 bytes of tablespace id
+  and page number, and up to 3 offset bytes -- confirm. */
+  size_t max_len;
+  if (!have_len)
+    max_len= 1 + 5 + 5;
+  else if (!have_offset)
+    max_len= bpage && m_last == bpage
+      ? 1 + 3
+      : 1 + 3 + 5 + 5;
+  else if (bpage && m_last == bpage && m_last_offset <= offset)
+  {
+    /* Encode the offset relative from m_last_offset. */
+    offset-= m_last_offset;
+    max_len= 1 + 3 + 3;
+  }
+  else
+    max_len= 1 + 3 + 5 + 5 + 3;
+  byte *const log_ptr= m_log.open(alloc ? max_len + len : max_len);
+  byte *end= log_ptr + 1;
+  /* The 0x80 flag is set exactly when the page identifier is omitted,
+  that is, when one of the shorter bounds above was chosen. */
+  const byte same_page= max_len < 1 + 5 + 5 ? 0x80 : 0;
+  if (!same_page)
+  {
+    end= mlog_encode_varint(end, id.space());
+    end= mlog_encode_varint(end, id.page_no());
+    m_last= bpage;
+  }
+  if (have_offset)
+  {
+    byte* oend= mlog_encode_varint(end, offset);
+    if (oend + len > &log_ptr[16])
+    {
+      /* The record does not fit in 16 bytes: leave the low 4 bits of
+      the first byte zero, write the length as a separate
+      variable-length integer, and encode the header again after it.
+      The length is first adjusted by the header bytes written so far
+      and by the size of its own encoding. */
+      len+= oend - log_ptr - 15;
+      if (len >= MIN_3BYTE - 1)
+        len+= 2;
+      else if (len >= MIN_2BYTE)
+        len++;
+
+      *log_ptr= type | same_page;
+      end= mlog_encode_varint(log_ptr + 1, len);
+      if (!same_page)
+      {
+        end= mlog_encode_varint(end, id.space());
+        end= mlog_encode_varint(end, id.page_no());
+      }
+      end= mlog_encode_varint(end, offset);
+      return end;
+    }
+    else
+      end= oend;
+  }
+  else if (len >= 3 && end + len > &log_ptr[16])
+  {
+    /* Same as above, for a record type that carries no offset */
+    len+= end - log_ptr - 15;
+    if (len >= MIN_3BYTE - 1)
+      len+= 2;
+    else if (len >= MIN_2BYTE)
+      len++;
+
+    end= log_ptr;
+    *end++= type | same_page;
+    end= mlog_encode_varint(end, len);
+
+    if (!same_page)
+    {
+      end= mlog_encode_varint(end, id.space());
+      end= mlog_encode_varint(end, id.page_no());
+    }
+    return end;
+  }
+
+  ut_ad(end + len >= &log_ptr[1] + !same_page);
+  ut_ad(end + len <= &log_ptr[16]);
+  ut_ad(end <= &log_ptr[max_len]);
+  /* Short record: the low 4 bits of the first byte hold the number of
+  bytes that follow the first byte. */
+  *log_ptr= type | same_page | static_cast<byte>(end + len - log_ptr - 1);
+  ut_ad(*log_ptr & 15);
+  return end;
+}
+
+/** Write a byte string to a page.
+Unless the write type is FORCED (or logging is disabled), the leading
+bytes that already match the page contents are skipped, and nothing is
+written or logged if all bytes match.
+@param[in]      b       buffer page
+@param[in]      dest    destination within b.frame
+@param[in]      str     the data to write
+@param[in]      len     length of the data to write
+@tparam w       write request type */
+template<mtr_t::write_type w>
+inline void mtr_t::memcpy(const buf_block_t &b, void *dest, const void *str,
+                          ulint len)
+{
+  ut_ad(ut_align_down(dest, srv_page_size) == b.page.frame);
+  char *d= static_cast<char*>(dest);
+  const char *s= static_cast<const char*>(str);
+  if (w != FORCED && is_logged())
+  {
+    ut_ad(len);
+    const char *const end= d + len;
+    /* Advance both cursors past the common prefix. */
+    for (; *d == *s; s++)
+    {
+      if (++d == end)
+      {
+        /* Every byte already has the requested value. */
+        ut_ad(w == MAYBE_NOP);
+        return;
+      }
+    }
+    /* Only the bytes from the first difference onwards remain. */
+    len= static_cast<ulint>(end - d);
+  }
+  ::memcpy(d, s, len);
+  memcpy(b, ut_align_offset(d, srv_page_size), len);
+}
+
+/** Write an EXTENDED log record whose payload is a single subtype byte.
+@param block  buffer pool page
+@param type   extended record subtype; @see mrec_ext_t */
+inline void mtr_t::log_write_extended(const buf_block_t &block, byte type)
+{
+  set_modified(block);
+  if (is_logged())
+  {
+    byte *const l= log_write<EXTENDED>(block.page.id(), &block.page, 1, true);
+    *l= type;
+    m_log.close(l + 1);
+    m_last_offset= FIL_PAGE_TYPE;
+  }
+}
+
+/** Write log for partly initializing a B-tree or R-tree page.
+@param block    B-tree or R-tree page
+@param comp     false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
+inline void mtr_t::page_create(const buf_block_t &block, bool comp)
+{
+  /* The flag is used as the extended record subtype as is. */
+  static_assert(INIT_ROW_FORMAT_REDUNDANT == false, "encoding");
+  static_assert(INIT_ROW_FORMAT_DYNAMIC == true, "encoding");
+  const byte subtype= comp;
+  log_write_extended(block, subtype);
+}
+
+/** Write log for deleting a B-tree or R-tree record in ROW_FORMAT=REDUNDANT.
+@param block      B-tree or R-tree page
+@param prev_rec   byte offset of the predecessor of the record to delete,
+                  starting from PAGE_OLD_INFIMUM */
+inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec)
+{
+  ut_ad(!block.zip_size());
+  ut_ad(prev_rec < block.physical_size());
+  set_modified(block);
+  if (!is_logged())
+    return;
+  /* 1 subtype byte, followed by the variable-length encoded prev_rec */
+  size_t len= 1;
+  if (prev_rec < MIN_2BYTE)
+    len+= 1;
+  else if (prev_rec < MIN_3BYTE)
+    len+= 2;
+  else
+    len+= 3;
+  byte *const start= log_write<EXTENDED>(block.page.id(), &block.page, len,
+                                         true);
+  *start= DELETE_ROW_FORMAT_REDUNDANT;
+  byte *const end= mlog_encode_varint(start + 1, prev_rec);
+  ut_ad(end == start + len);
+  m_log.close(end);
+  m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Write log for deleting a COMPACT or DYNAMIC B-tree or R-tree record.
+@param block      B-tree or R-tree page
+@param prev_rec   byte offset of the predecessor of the record to delete,
+                  starting from PAGE_NEW_INFIMUM
+@param hdr_size   record header size, excluding REC_N_NEW_EXTRA_BYTES
+@param data_size  data payload size, in bytes */
+inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec,
+                               size_t hdr_size, size_t data_size)
+{
+  ut_ad(!block.zip_size());
+  set_modified(block);
+  ut_ad(hdr_size < MIN_3BYTE);
+  ut_ad(prev_rec < block.physical_size());
+  ut_ad(data_size < block.physical_size());
+  if (!is_logged())
+    return;
+  /* 1 subtype byte, followed by three variable-length encoded integers */
+  const size_t prev_len= prev_rec < MIN_2BYTE ? 1 : prev_rec < MIN_3BYTE ? 2 : 3;
+  const size_t hdr_len= hdr_size < MIN_2BYTE ? 1 : 2;
+  const size_t data_len=
+    data_size < MIN_2BYTE ? 1 : data_size < MIN_3BYTE ? 2 : 3;
+  const size_t len= 1 + prev_len + hdr_len + data_len;
+  byte *const start= log_write<EXTENDED>(block.page.id(), &block.page, len,
+                                         true);
+  *start= DELETE_ROW_FORMAT_DYNAMIC;
+  byte *l= mlog_encode_varint(start + 1, prev_rec);
+  l= mlog_encode_varint(l, hdr_size);
+  l= mlog_encode_varint(l, data_size);
+  ut_ad(l == start + len);
+  m_log.close(l);
+  m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Write log for initializing an undo log page.
+This writes an EXTENDED record with the subtype UNDO_INIT
+via log_write_extended().
+@param block    undo page */
+inline void mtr_t::undo_create(const buf_block_t &block)
+{
+  log_write_extended(block, UNDO_INIT);
+}
+
+/** Write log for appending an undo log record.
+The EXTENDED record payload is the subtype byte UNDO_APPEND followed by
+the undo record.
+@param block    undo page
+@param data     record within the undo page
+@param len      length of the undo record, in bytes */
+inline void mtr_t::undo_append(const buf_block_t &block,
+                               const void *data, size_t len)
+{
+  ut_ad(len > 2);
+  set_modified(block);
+  if (!is_logged())
+    return;
+  /* Payload size: the subtype byte plus the undo record */
+  const size_t total= len + 1;
+  /* Whether the record header and the payload are requested from the
+  log buffer in a single allocation */
+  const bool fits= total < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5);
+  byte *const l= log_write<EXTENDED>(block.page.id(), &block.page, total,
+                                     fits);
+  if (UNIV_UNLIKELY(!fits))
+  {
+    /* Finish the header, then append the payload separately. */
+    m_log.close(l);
+    *m_log.push<byte*>(1)= UNDO_APPEND;
+    m_log.push(static_cast<const byte*>(data), static_cast<uint32_t>(len));
+  }
+  else
+  {
+    *l= UNDO_APPEND;
+    ::memcpy(l + 1, data, len);
+    m_log.close(l + 1 + len);
+  }
+  m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Trim the end of a tablespace.
+Writes an EXTENDED record with the subtype TRIM_PAGES and flags the
+mini-transaction via set_trim_pages(). Nothing is done if the
+mini-transaction is not logged.
+@param id       first page identifier that will not be in the file */
+inline void mtr_t::trim_pages(const page_id_t id)
+{
+  if (is_logged())
+  {
+    byte *const l= log_write<EXTENDED>(id, nullptr, 1, true);
+    *l= TRIM_PAGES;
+    m_log.close(l + 1);
+    set_trim_pages();
+  }
+}
diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h
new file mode 100644
index 00000000..841cfab1
--- /dev/null
+++ b/storage/innobase/include/mtr0mtr.h
@@ -0,0 +1,780 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0mtr.h
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "fil0fil.h"
+#include "dyn0buf.h"
+#include "buf0buf.h"
+#include "small_vector.h"
+
+/** Start a mini-transaction.
+@param m	pointer to the mini-transaction */
+#define mtr_start(m)		(m)->start()
+
+/** Commit a mini-transaction.
+@param m	pointer to the mini-transaction */
+#define mtr_commit(m)		(m)->commit()
+
+/** Change the logging mode of a mini-transaction.
+@param m	pointer to the mini-transaction
+@param d	the new logging mode
+@return	old mode */
+#define mtr_set_log_mode(m, d)	(m)->set_log_mode((d))
+
+/* Acquire the latch of index i (i->lock) in S, X, or U (SX) mode via
+mini-transaction m. With UNIV_PFS_RWLOCK defined, the file name and line
+number of the call site are passed along as well. */
+#ifdef UNIV_PFS_RWLOCK
+# define mtr_s_lock_index(i,m)	(m)->s_lock(__FILE__, __LINE__, &(i)->lock)
+# define mtr_x_lock_index(i,m)	(m)->x_lock(__FILE__, __LINE__, &(i)->lock)
+# define mtr_sx_lock_index(i,m)	(m)->u_lock(__FILE__, __LINE__, &(i)->lock)
+#else
+# define mtr_s_lock_index(i,m)	(m)->s_lock(&(i)->lock)
+# define mtr_x_lock_index(i,m)	(m)->x_lock(&(i)->lock)
+# define mtr_sx_lock_index(i,m)	(m)->u_lock(&(i)->lock)
+#endif
+
+/** Mini-transaction memo stack slot:
+an object together with a type tag that tells what the object is. */
+struct mtr_memo_slot_t
+{
+  /** pointer to the object (untyped; interpret it according to type) */
+  void *object;
+  /** type of the stored object */
+  mtr_memo_type_t type;
+
+  /** Release the object */
+  void release() const;
+};
+
+/** Mini-transaction handle and buffer */
+struct mtr_t {
+  mtr_t();
+  ~mtr_t();
+
+  /** Start a mini-transaction. */
+  void start();
+
+  /** Commit the mini-transaction. */
+  void commit();
+
+  /** Release latches of unmodified buffer pages.
+  @param begin   first slot to release
+  @param end     last slot to release, or get_savepoint() */
+  void rollback_to_savepoint(ulint begin, ulint end);
+
+  /** Release latches of unmodified buffer pages.
+  @param begin   first slot to release */
+  void rollback_to_savepoint(ulint begin)
+  { rollback_to_savepoint(begin, m_memo.size()); }
+
+  /** Release the last acquired buffer page latch. */
+  void release_last_page()
+  { auto s= m_memo.size(); rollback_to_savepoint(s - 1, s); }
+
+  /** Commit a mini-transaction that is shrinking a tablespace.
+  @param space   tablespace that is being shrunk */
+  ATTRIBUTE_COLD void commit_shrink(fil_space_t &space);
+
+  /** Commit a mini-transaction that is deleting or renaming a file.
+  @param space           tablespace that is being renamed or deleted
+  @param name            new file name (nullptr=the file will be deleted)
+  @return whether the operation succeeded */
+  ATTRIBUTE_COLD bool commit_file(fil_space_t &space, const char *name);
+
+  /** Commit a mini-transaction that did not modify any pages,
+  but generated some redo log on a higher level, such as
+  FILE_MODIFY records and an optional FILE_CHECKPOINT marker.
+  The caller must hold exclusive log_sys.latch.
+  This is to be used at log_checkpoint().
+  @param checkpoint_lsn   the log sequence number of a checkpoint, or 0
+  @return current LSN */
+  lsn_t commit_files(lsn_t checkpoint_lsn= 0);
+
+  /** @return mini-transaction savepoint (current size of m_memo) */
+  ulint get_savepoint() const
+  {
+    ut_ad(is_active());
+    return m_memo.size();
+  }
+
+  /** Get the block at a savepoint */
+  buf_block_t *at_savepoint(ulint savepoint) const
+  {
+    ut_ad(is_active());
+    const mtr_memo_slot_t &slot= m_memo[savepoint];
+    ut_ad(slot.type < MTR_MEMO_S_LOCK);
+    ut_ad(slot.object);
+    return static_cast<buf_block_t*>(slot.object);
+  }
+
+  /** Try to get a block at a savepoint.
+  @param savepoint the savepoint right before the block was acquired
+  @return the block at the savepoint
+  @retval nullptr  if no buffer block was registered at that savepoint */
+  buf_block_t *block_at_savepoint(ulint savepoint) const
+  {
+    ut_ad(is_active());
+    const mtr_memo_slot_t &slot= m_memo[savepoint];
+    return slot.type < MTR_MEMO_S_LOCK
+      ? static_cast<buf_block_t*>(slot.object)
+      : nullptr;
+  }
+
+  /** Retrieve a page that has already been latched.
+  @param id    page identifier
+  @param type  page latch type
+  @return block
+  @retval nullptr if the block had not been latched yet */
+  buf_block_t *get_already_latched(const page_id_t id, mtr_memo_type_t type)
+    const;
+
+  /** @return the logging mode */
+  mtr_log_t get_log_mode() const
+  {
+    static_assert(MTR_LOG_ALL == 0, "efficiency");
+    return static_cast<mtr_log_t>(m_log_mode);
+  }
+
+  /** @return whether log is to be written for changes */
+  bool is_logged() const
+  {
+    static_assert(MTR_LOG_ALL == 0, "efficiency");
+    static_assert(MTR_LOG_NONE & MTR_LOG_NO_REDO, "efficiency");
+    static_assert(!(MTR_LOG_NONE & MTR_LOG_SUB), "efficiency");
+    return !(m_log_mode & MTR_LOG_NONE);
+  }
+
+  /** Change the logging mode.
+  @param mode	 logging mode
+  @return	old mode */
+  mtr_log_t set_log_mode(mtr_log_t mode)
+  {
+    const mtr_log_t old_mode= get_log_mode();
+    m_log_mode= mode & 3;
+    return old_mode;
+  }
+
+  /** Set the log mode of a sub-minitransaction
+  @param mtr  parent mini-transaction */
+  void set_log_mode_sub(const mtr_t &mtr)
+  {
+    ut_ad(mtr.m_log_mode == MTR_LOG_ALL || mtr.m_log_mode == MTR_LOG_NO_REDO);
+    m_log_mode= mtr.m_log_mode | MTR_LOG_SUB;
+    static_assert((MTR_LOG_SUB | MTR_LOG_NO_REDO) == MTR_LOG_NO_REDO, "");
+  }
+
+  /** Check if we are holding a block latch in exclusive mode
+  @param block  buffer pool block to search for */
+  bool have_x_latch(const buf_block_t &block) const;
+
+  /** Check if we are holding a block latch in U or X mode
+  @param block  buffer pool block to search for */
+  bool have_u_or_x_latch(const buf_block_t &block) const;
+
+	/** Copy the tablespaces associated with the mini-transaction
+	(needed for generating FILE_MODIFY records)
+	@param[in]	mtr	mini-transaction that may modify
+	the same set of tablespaces as this one */
+	void set_spaces(const mtr_t& mtr)
+	{
+		ut_ad(!m_user_space_id);
+		ut_ad(!m_user_space);
+
+		ut_d(m_user_space_id = mtr.m_user_space_id);
+		m_user_space = mtr.m_user_space;
+	}
+
+	/** Set the tablespace associated with the mini-transaction
+	(needed for generating a FILE_MODIFY record)
+	@param[in]	space_id	user or system tablespace ID
+	@return	the tablespace */
+	fil_space_t* set_named_space_id(uint32_t space_id)
+	{
+		ut_ad(!m_user_space_id);
+		ut_d(m_user_space_id = space_id);
+		if (!space_id) {
+			return fil_system.sys_space;
+		} else {
+			ut_ad(m_user_space_id == space_id);
+			ut_ad(!m_user_space);
+			m_user_space = fil_space_get(space_id);
+			ut_ad(m_user_space);
+			return m_user_space;
+		}
+	}
+
+	/** Set the tablespace associated with the mini-transaction
+	(needed for generating a FILE_MODIFY record)
+	@param[in]	space	user or system tablespace */
+	void set_named_space(fil_space_t* space)
+	{
+		ut_ad(!m_user_space_id);
+		ut_d(m_user_space_id = space->id);
+		if (space->id) {
+			m_user_space = space;
+		}
+	}
+
+#ifdef UNIV_DEBUG
+	/** Check the tablespace associated with the mini-transaction
+	(needed for generating a FILE_MODIFY record)
+	@param[in]	space	tablespace
+	@return whether the mini-transaction is associated with the space */
+	bool is_named_space(uint32_t space) const;
+	/** Check the tablespace associated with the mini-transaction
+	(needed for generating a FILE_MODIFY record)
+	@param[in]	space	tablespace
+	@return whether the mini-transaction is associated with the space */
+	bool is_named_space(const fil_space_t* space) const;
+#endif /* UNIV_DEBUG */
+
+  /** Acquire a tablespace X-latch.
+  @param space_id   tablespace ID
+  @return the tablespace object (never NULL) */
+  fil_space_t *x_lock_space(uint32_t space_id);
+
+  /** Acquire a shared rw-latch. */
+  void s_lock(
+#ifdef UNIV_PFS_RWLOCK
+    const char *file, unsigned line,
+#endif
+    index_lock *lock)
+  {
+    lock->s_lock(SRW_LOCK_ARGS(file, line));
+    memo_push(lock, MTR_MEMO_S_LOCK);
+  }
+
+  /** Acquire an exclusive rw-latch. */
+  void x_lock(
+#ifdef UNIV_PFS_RWLOCK
+    const char *file, unsigned line,
+#endif
+    index_lock *lock)
+  {
+    lock->x_lock(SRW_LOCK_ARGS(file, line));
+    memo_push(lock, MTR_MEMO_X_LOCK);
+  }
+
+  /** Acquire an update latch. */
+  void u_lock(
+#ifdef UNIV_PFS_RWLOCK
+    const char *file, unsigned line,
+#endif
+    index_lock *lock)
+  {
+    lock->u_lock(SRW_LOCK_ARGS(file, line));
+    memo_push(lock, MTR_MEMO_SX_LOCK);
+  }
+
+  /** Acquire an exclusive tablespace latch.
+  @param space  tablespace */
+  void x_lock_space(fil_space_t *space);
+
+  /** Release an index latch. */
+  void release(const index_lock &lock) { release(&lock); }
+  /** Release a latch to an unmodified page. */
+  void release(const buf_block_t &block) { release(&block); }
+private:
+  /** Release an unmodified object. */
+  void release(const void *object);
+public:
+  /** Mark the given latched page as modified.
+  @param block   page that will be modified */
+  void set_modified(const buf_block_t &block);
+
+  /** Set the state to not-modified. This will not log the changes.
+  This is only used during redo log apply, to avoid logging the changes. */
+  void discard_modifications() { m_modifications= false; }
+
+  /** Get the LSN of commit().
+  @return the commit LSN
+  @retval 0 if the transaction only modified temporary tablespaces */
+  lsn_t commit_lsn() const { ut_ad(has_committed()); return m_commit_lsn; }
+
+  /** Note that we are inside the change buffer code. */
+  void enter_ibuf() { m_inside_ibuf= true; }
+
+  /** Note that we have exited from the change buffer code. */
+  void exit_ibuf() { m_inside_ibuf= false; }
+
+  /** @return true if we are inside the change buffer code */
+  bool is_inside_ibuf() const { return m_inside_ibuf; }
+
+  /** Note that some pages have been freed */
+  void set_trim_pages() { m_trim_pages= true; }
+
+  /** Latch a buffer pool block.
+  @param block    block to be latched
+  @param rw_latch RW_S_LATCH, RW_SX_LATCH, RW_X_LATCH, RW_NO_LATCH */
+  void page_lock(buf_block_t *block, ulint rw_latch);
+
+  /** Acquire a latch on a buffer-fixed buffer pool block.
+  @param savepoint   savepoint location of the buffer-fixed block
+  @param rw_latch    latch to acquire */
+  void upgrade_buffer_fix(ulint savepoint, rw_lock_type_t rw_latch);
+
+  /** Overwrite the page latch type stored in the memo slot at a savepoint. */
+  void lock_register(ulint savepoint, mtr_memo_type_t type)
+  {
+    mtr_memo_slot_t *const entry= &m_memo[savepoint];
+    ut_ad(entry->type <= MTR_MEMO_BUF_FIX);
+    ut_ad(type < MTR_MEMO_S_LOCK);
+    entry->type= type;
+  }
+
+  /** Upgrade U locks on a block to X */
+  void page_lock_upgrade(const buf_block_t &block);
+
+  /** Upgrade index U lock to X */
+  ATTRIBUTE_COLD void index_lock_upgrade();
+
+  /** Check if we are holding tablespace latch
+  @param space  tablespace to search for
+  @return whether space.latch is being held */
+  bool memo_contains(const fil_space_t& space) const
+    MY_ATTRIBUTE((warn_unused_result));
+#ifdef UNIV_DEBUG
+  /** Check if we are holding an rw-latch in this mini-transaction
+  @param lock   latch to search for
+  @param type   held latch type
+  @return whether (lock,type) is contained */
+  bool memo_contains(const index_lock &lock, mtr_memo_type_t type) const
+    MY_ATTRIBUTE((warn_unused_result));
+
+  /** Check if memo contains an index or buffer block latch.
+  @param object    object to search
+  @param flags     specify types of object latches
+  @return true if contains */
+  bool memo_contains_flagged(const void *object, ulint flags) const
+    MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+  /** Check if memo contains the given page.
+  @param ptr   pointer to within page frame
+  @param flags types of latches to look for
+  @return the block
+  @retval nullptr    if not found */
+  buf_block_t *memo_contains_page_flagged(const byte *ptr, ulint flags) const;
+
+  /** @return whether this mini-transaction modifies persistent data */
+  bool has_modifications() const { return m_modifications; }
+#endif /* UNIV_DEBUG */
+
+  /** Push a buffer page to the memo.
+  @param block  buffer block
+  @param type   page latch type: MTR_MEMO_PAGE_S_FIX, ..., MTR_MEMO_BUF_FIX */
+  void memo_push(buf_block_t *block, mtr_memo_type_t type)
+    __attribute__((nonnull))
+  {
+    ut_ad(is_active());
+    ut_ad(type <= MTR_MEMO_PAGE_SX_MODIFY);
+    ut_ad(block->page.buf_fix_count());
+    ut_ad(block->page.in_file());
+#ifdef UNIV_DEBUG
+    switch (type) {
+    case MTR_MEMO_PAGE_S_FIX:
+      ut_ad(block->page.lock.have_s());
+      break;
+    case MTR_MEMO_PAGE_X_FIX: case MTR_MEMO_PAGE_X_MODIFY:
+      ut_ad(block->page.lock.have_x());
+      break;
+    case MTR_MEMO_PAGE_SX_FIX: case MTR_MEMO_PAGE_SX_MODIFY:
+      ut_ad(block->page.lock.have_u_or_x());
+      break;
+    case MTR_MEMO_BUF_FIX:
+      break;
+    case MTR_MEMO_MODIFY:
+    case MTR_MEMO_S_LOCK: case MTR_MEMO_X_LOCK: case MTR_MEMO_SX_LOCK:
+    case MTR_MEMO_SPACE_X_LOCK:
+      ut_ad("invalid type" == 0);
+    }
+#endif
+    if (type & MTR_MEMO_MODIFY)
+    {
+      if (block->page.id().space() >= SRV_TMP_SPACE_ID)
+      {
+        block->page.set_temp_modified();
+        type= mtr_memo_type_t(type & ~MTR_MEMO_MODIFY);
+      }
+      else
+      {
+        m_modifications= true;
+        /* Modifying a previously clean persistent page: commit() must then
+        acquire log_sys.flush_order_mutex and add it to buf_pool.flush_list. */
+        if (!m_made_dirty)
+          m_made_dirty= block->page.oldest_modification() <= 1;
+      }
+    }
+    m_memo.emplace_back(mtr_memo_slot_t{block, type});
+  }
+
+  /** Push an index lock or tablespace latch to the memo.
+  @param object index lock or tablespace latch
+  @param type   latch type; must be MTR_MEMO_S_LOCK or greater */
+  void memo_push(void *object, mtr_memo_type_t type) __attribute__((nonnull))
+  {
+    ut_ad(is_active());
+    ut_ad(type >= MTR_MEMO_S_LOCK);
+    m_memo.emplace_back(mtr_memo_slot_t{object, type});
+  }
+
+  /** @return the size of the mini-transaction log */
+  size_t get_log_size() const { return m_log.size(); }
+  /** @return whether the log and memo are empty */
+  bool is_empty() const { return !get_savepoint() && !get_log_size(); }
+
+  /** Write an OPT_PAGE_CHECKSUM record. */
+  inline void page_checksum(const buf_page_t &bpage);
+
+  /** Write request types */
+  enum write_type
+  {
+    /** the page is guaranteed to always change */
+    NORMAL= 0,
+    /** optional: the page contents might not change */
+    MAYBE_NOP,
+    /** force a write, even if the page contents is not changing */
+    FORCED
+  };
+
+  /** Write 1, 2, 4, or 8 bytes to a file page.
+  @param[in]      block   file page
+  @param[in,out]  ptr     pointer in file page
+  @param[in]      val     value to write
+  @tparam l       number of bytes to write
+  @tparam w       write request type
+  @tparam V       type of val
+  @return whether any log was written */
+  template<unsigned l,write_type w= NORMAL,typename V>
+  inline bool write(const buf_block_t &block, void *ptr, V val)
+    MY_ATTRIBUTE((nonnull));
+
+  /** Log a write of a byte string to a page.
+  @param[in]      b       buffer page
+  @param[in]      ofs     byte offset from b->frame
+  @param[in]      len     length of the data to write */
+  inline void memcpy(const buf_block_t &b, ulint ofs, ulint len);
+
+  /** Write a byte string to a page.
+  @param[in,out]  b       buffer page
+  @param[in]      dest    destination within b.frame
+  @param[in]      str     the data to write
+  @param[in]      len     length of the data to write
+  @tparam w       write request type */
+  template<write_type w= NORMAL>
+  inline void memcpy(const buf_block_t &b, void *dest, const void *str,
+                     ulint len);
+
+  /** Log a write of a byte string to a ROW_FORMAT=COMPRESSED page.
+  @param[in]      b       ROW_FORMAT=COMPRESSED index page
+  @param[in]      offset  byte offset from b.zip.data
+  @param[in]      len     length of the data to write */
+  inline void zmemcpy(const buf_block_t &b, ulint offset, ulint len);
+
+  /** Write a byte string to a ROW_FORMAT=COMPRESSED page.
+  @param[in]      b       ROW_FORMAT=COMPRESSED index page
+  @param[in]      dest    destination within b.zip.data
+  @param[in]      str     the data to write
+  @param[in]      len     length of the data to write
+  @tparam w       write request type */
+  template<write_type w= NORMAL>
+  inline void zmemcpy(const buf_block_t &b, void *dest, const void *str,
+                      ulint len);
+
+  /** Log an initialization of a string of bytes.
+  @param[in]      b       buffer page
+  @param[in]      ofs     byte offset from b->frame
+  @param[in]      len     length of the data to write
+  @param[in]      val     the data byte to write */
+  inline void memset(const buf_block_t &b, ulint ofs, ulint len, byte val);
+
+  /** Initialize a string of bytes.
+  @param[in,out]        b       buffer page
+  @param[in]            ofs     byte offset from b->frame
+  @param[in]            len     length of the data to write
+  @param[in]            val     the data byte to write */
+  inline void memset(const buf_block_t *b, ulint ofs, ulint len, byte val);
+
+  /** Log an initialization of a repeating string of bytes.
+  @param[in]      b       buffer page
+  @param[in]      ofs     byte offset from b->frame
+  @param[in]      len     length of the data to write, in bytes
+  @param[in]      str     the string to write
+  @param[in]      size    size of str, in bytes */
+  inline void memset(const buf_block_t &b, ulint ofs, size_t len,
+                     const void *str, size_t size);
+
+  /** Initialize a repeating string of bytes.
+  @param[in,out]  b       buffer page
+  @param[in]      ofs     byte offset from b->frame
+  @param[in]      len     length of the data to write, in bytes
+  @param[in]      str     the string to write
+  @param[in]      size    size of str, in bytes */
+  inline void memset(const buf_block_t *b, ulint ofs, size_t len,
+                     const void *str, size_t size);
+
+  /** Log that a string of bytes was copied from the same page.
+  @param[in]      b       buffer page
+  @param[in]      d       destination offset within the page
+  @param[in]      s       source offset within the page
+  @param[in]      len     length of the data to copy */
+  inline void memmove(const buf_block_t &b, ulint d, ulint s, ulint len);
+
+  /** Initialize an entire page.
+  @param[in,out]        b       buffer page */
+  void init(buf_block_t *b);
+  /** Free a page.
+  @param space   tablespace
+  @param offset  offset of the page to be freed */
+  void free(const fil_space_t &space, uint32_t offset);
+  /** Write log for partly initializing a B-tree or R-tree page.
+  @param block    B-tree or R-tree page
+  @param comp     false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
+  inline void page_create(const buf_block_t &block, bool comp);
+
+  /** Write log for inserting a B-tree or R-tree record in
+  ROW_FORMAT=REDUNDANT.
+  @param block      B-tree or R-tree page
+  @param reuse      false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+  @param prev_rec   byte offset of the predecessor of the record to insert,
+                    starting from PAGE_OLD_INFIMUM
+  @param info_bits  info_bits of the record
+  @param n_fields_s number of fields << 1 | rec_get_1byte_offs_flag()
+  @param hdr_c      number of common record header bytes with prev_rec
+  @param data_c     number of common data bytes with prev_rec
+  @param hdr        record header bytes to copy to the log
+  @param hdr_l      number of copied record header bytes
+  @param data       record payload bytes to copy to the log
+  @param data_l     number of copied record data bytes */
+  inline void page_insert(const buf_block_t &block, bool reuse,
+                          ulint prev_rec, byte info_bits,
+                          ulint n_fields_s, size_t hdr_c, size_t data_c,
+                          const byte *hdr, size_t hdr_l,
+                          const byte *data, size_t data_l);
+  /** Write log for inserting a B-tree or R-tree record in
+  ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC.
+  @param block       B-tree or R-tree page
+  @param reuse       false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+  @param prev_rec    byte offset of the predecessor of the record to insert,
+                     starting from PAGE_NEW_INFIMUM
+  @param info_status rec_get_info_and_status_bits()
+  @param shift       unless !reuse: number of bytes the PAGE_FREE is moving
+  @param hdr_c       number of common record header bytes with prev_rec
+  @param data_c      number of common data bytes with prev_rec
+  @param hdr         record header bytes to copy to the log
+  @param hdr_l       number of copied record header bytes
+  @param data        record payload bytes to copy to the log
+  @param data_l      number of copied record data bytes */
+  inline void page_insert(const buf_block_t &block, bool reuse,
+                          ulint prev_rec, byte info_status,
+                          ssize_t shift, size_t hdr_c, size_t data_c,
+                          const byte *hdr, size_t hdr_l,
+                          const byte *data, size_t data_l);
+  /** Write log for deleting a B-tree or R-tree record in ROW_FORMAT=REDUNDANT.
+  @param block      B-tree or R-tree page
+  @param prev_rec   byte offset of the predecessor of the record to delete,
+                    starting from PAGE_OLD_INFIMUM */
+  inline void page_delete(const buf_block_t &block, ulint prev_rec);
+  /** Write log for deleting a COMPACT or DYNAMIC B-tree or R-tree record.
+  @param block      B-tree or R-tree page
+  @param prev_rec   byte offset of the predecessor of the record to delete,
+                    starting from PAGE_NEW_INFIMUM
+  @param hdr_size   record header size, excluding REC_N_NEW_EXTRA_BYTES
+  @param data_size  data payload size, in bytes */
+  inline void page_delete(const buf_block_t &block, ulint prev_rec,
+                          size_t hdr_size, size_t data_size);
+
+  /** Write log for initializing an undo log page.
+  @param block    undo page */
+  inline void undo_create(const buf_block_t &block);
+  /** Write log for appending an undo log record.
+  @param block    undo page
+  @param data     record within the undo page
+  @param len      length of the undo record, in bytes */
+  inline void undo_append(const buf_block_t &block,
+                          const void *data, size_t len);
+  /** Trim the end of a tablespace.
+  @param id       first page identifier that will not be in the file */
+  inline void trim_pages(const page_id_t id);
+
+  /** Write a log record about a file operation.
+  @param type           file operation
+  @param space_id       tablespace identifier
+  @param path           file path
+  @param new_path       new file path for type=FILE_RENAME */
+  inline void log_file_op(mfile_type_t type, uint32_t space_id,
+                          const char *path,
+                          const char *new_path= nullptr);
+
+  /** Remember a freed page number; the set is allocated on first use. */
+  void add_freed_offset(fil_space_t *space, uint32_t page)
+  {
+    ut_ad(is_named_space(space));
+    if (m_freed_pages)
+      ut_ad(m_freed_space == space);
+    else
+    {
+      m_freed_pages= new range_set();
+      ut_ad(!m_freed_space);
+      m_freed_space= space;
+    }
+    m_freed_pages->add_value(page);
+  }
+
+  /** Determine the added buffer fix count of a block.
+  @param block block to be checked
+  @return number of buffer count added by this mtr */
+  uint32_t get_fix_count(const buf_block_t *block) const;
+
+  /** Note that log_sys.latch is no longer being held exclusively. */
+  void flag_wr_unlock() noexcept { ut_ad(m_latch_ex); m_latch_ex= false; }
+
+  /** type of page flushing that is needed during commit() */
+  enum page_flush_ahead
+  {
+    /** no need to trigger page cleaner */
+    PAGE_FLUSH_NO= 0,
+    /** asynchronous flushing is needed */
+    PAGE_FLUSH_ASYNC,
+    /** furious flushing is needed */
+    PAGE_FLUSH_SYNC
+  };
+
+private:
+  /** Handle any pages that were freed during the mini-transaction. */
+  void process_freed_pages();
+  /** Release modified pages when no log was written. */
+  void release_unlogged();
+
+  /** Log a write of a byte string to a page.
+  @param block   buffer page
+  @param offset  byte offset within page
+  @param data    data to be written
+  @param len     length of the data, in bytes */
+  inline void memcpy_low(const buf_block_t &block, uint16_t offset,
+                         const void *data, size_t len);
+  /**
+  Write a log record.
+  @tparam type  redo log record type
+  @param id     persistent page identifier
+  @param bpage  buffer pool page, or nullptr
+  @param len    number of additional bytes to write
+  @param alloc  whether to allocate the additional bytes
+  @param offset byte offset, or 0 if the record type does not allow one
+  @return end of mini-transaction log, minus len */
+  template<byte type>
+  inline byte *log_write(const page_id_t id, const buf_page_t *bpage,
+                         size_t len= 0, bool alloc= false, size_t offset= 0);
+
+  /** Write an EXTENDED log record.
+  @param block  buffer pool page
+  @param type   extended record subtype; @see mrec_ext_t */
+  inline void log_write_extended(const buf_block_t &block, byte type);
+
+  /** Write a FILE_MODIFY record when a non-predefined persistent
+  tablespace was modified for the first time since fil_names_clear(). */
+  ATTRIBUTE_NOINLINE ATTRIBUTE_COLD void name_write();
+
+  /** Encrypt the log */
+  ATTRIBUTE_NOINLINE void encrypt();
+
+  /** Append the redo log records to the redo log buffer.
+  @return {start_lsn,flush_ahead} */
+  std::pair<lsn_t,page_flush_ahead> do_write();
+
+  /** Append the redo log records to the redo log buffer.
+  @param len   number of bytes to write
+  @return {start_lsn,flush_ahead} */
+  std::pair<lsn_t,page_flush_ahead> finish_write(size_t len);
+
+  /** Release all latches. */
+  void release();
+  /** Release the resources */
+  inline void release_resources();
+
+#ifdef UNIV_DEBUG
+public:
+  /** @return whether the mini-transaction is active */
+  bool is_active() const
+  { ut_ad(!m_commit || m_start); return m_start && !m_commit; }
+  /** @return whether the mini-transaction has been committed */
+  bool has_committed() const { ut_ad(!m_commit || m_start); return m_commit; }
+  /** @return whether the mini-transaction is freeing an index tree */
+  bool is_freeing_tree() const { return m_freeing_tree; }
+  /** Notify that the mini-transaction is freeing an index tree */
+  void freeing_tree() { m_freeing_tree= true; }
+private:
+  /** whether start() has been called */
+  bool m_start= false;
+  /** whether commit() has been called */
+  bool m_commit= false;
+  /** whether freeing_tree() has been called */
+  bool m_freeing_tree= false;
+#endif
+private:
+  /** The page of the most recent m_log record written, or NULL */
+  const buf_page_t* m_last;
+  /** The current byte offset in m_last, or 0 */
+  uint16_t m_last_offset;
+
+  /** specifies which operations should be logged; default MTR_LOG_ALL */
+  uint16_t m_log_mode:2;
+
+  /** whether at least one persistent page was written to */
+  uint16_t m_modifications:1;
+
+  /** whether at least one previously clean buffer pool page was written to */
+  uint16_t m_made_dirty:1;
+
+  /** whether log_sys.latch is locked exclusively */
+  uint16_t m_latch_ex:1;
+
+  /** whether change buffer is latched; only needed in non-debug builds
+  to suppress some read-ahead operations, @see ibuf_inside() */
+  uint16_t m_inside_ibuf:1;
+
+  /** whether pages have been trimmed */
+  uint16_t m_trim_pages:1;
+
+  /** CRC-32C of m_log */
+  uint32_t m_crc;
+
+#ifdef UNIV_DEBUG
+  /** Persistent user tablespace associated with the
+  mini-transaction, or 0 (TRX_SYS_SPACE) if none yet */
+  uint32_t m_user_space_id;
+#endif /* UNIV_DEBUG */
+
+  /** acquired dict_index_t::lock, fil_space_t::latch, buf_block_t */
+  small_vector<mtr_memo_slot_t, 16> m_memo;
+
+  /** mini-transaction log */
+  mtr_buf_t m_log;
+
+  /** user tablespace that is being modified by the mini-transaction */
+  fil_space_t* m_user_space;
+
+  /** LSN at commit time */
+  lsn_t m_commit_lsn;
+
+  /** tablespace where pages have been freed */
+  fil_space_t *m_freed_space= nullptr;
+  /** set of freed page ids */
+  range_set *m_freed_pages= nullptr;
+};
diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h
new file mode 100644
index 00000000..19db13a1
--- /dev/null
+++ b/storage/innobase/include/mtr0types.h
@@ -0,0 +1,347 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0types.h
+Mini-transaction buffer global types
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "buf0types.h"
+
+#include "ut0byte.h"
+
+struct mtr_t;
+
+/** Logging modes for a mini-transaction */
+enum mtr_log_t {
+	/** Default mode: log all operations modifying disk-based data */
+	MTR_LOG_ALL = 0,
+
+	/** Log no operations and dirty pages are not added to the flush list.
+	Set for attempting modification of a ROW_FORMAT=COMPRESSED page. */
+	MTR_LOG_NONE,
+
+	/** Log all operations, but do not write any OPT_PAGE_CHECKSUM
+	records because some of the modified pages were also modified
+	by another mini-transaction that did not write its log yet. */
+	MTR_LOG_SUB,
+
+	/** Don't generate REDO log but add dirty pages to flush list */
+	MTR_LOG_NO_REDO
+};
+
+/*
+A mini-transaction is a stream of records that is always terminated by
+a byte 0x00 or 0x01. The first byte of a mini-transaction record is
+never one of these bytes, but these bytes can occur within mini-transaction
+records.
+
+The first byte of the record would contain a record type, flags, and a
+part of length. The optional second byte of the record will contain
+more length. (Not needed for short records.)
+
+For example, because the length of an INIT_PAGE record is 3 to 11 bytes, the
+first byte will be 0x12 to 0x1a; bits 3..0 are the number of subsequent bytes.
+
+Bit 7 of the first byte of a redo log record is the same_page flag.
+If same_page=1, the record is referring to the same page as the
+previous record. Records that do not refer to data pages but to file
+operations are identified by setting the same_page=1 in the very first
+record(s) of the mini-transaction. A mini-transaction record that
+carries same_page=0 must only be followed by page-oriented records.
+
+Bits 6..4 of the first byte of a redo log record identify the redo log
+type. The following record types refer to data pages:
+
+    FREE_PAGE (0): corresponds to MLOG_INIT_FREE_PAGE
+    INIT_PAGE (1): corresponds to MLOG_INIT_FILE_PAGE2
+    EXTENDED (2): extended record; followed by subtype code @see mrec_ext_t
+    WRITE (3): replaces MLOG_nBYTES, MLOG_WRITE_STRING, MLOG_ZIP_*
+    MEMSET (4): extends the 10.4 MLOG_MEMSET record
+    MEMMOVE (5): copy data within the page (avoids logging redundant data)
+    RESERVED (6): reserved for future use; a subtype code
+    (encoded immediately after the length) would be written
+    to reserve code space for further extensions
+    OPTION (7): optional record that may be ignored; a subtype @see mrec_opt
+    (encoded immediately after the length) would distinguish actual usage
+
+Bits 3..0 indicate the redo log record length, excluding the first
+byte, but including additional length bytes and any other bytes,
+such as the optional tablespace identifier and page number.
+Values 1..15 represent lengths of 1 to 15 bytes. The special value 0
+indicates that 1 to 3 length bytes will follow to encode the remaining
+length that exceeds 16 bytes.
+
+Additional length bytes if length>16: 0 to 3 bytes
+0xxxxxxx                   for 0 to 127 (total: 16 to 143 bytes)
+10xxxxxx xxxxxxxx          for 128 to 16511 (total: 144 to 16527)
+110xxxxx xxxxxxxx xxxxxxxx for 16512 to 2113663 (total: 16528 to 2113679)
+111xxxxx                   reserved (corrupted record, and file!)
+
+If same_page=0, the tablespace identifier and page number will use
+similar 1-to-5-byte variable-length encoding:
+0xxxxxxx                                     for 0 to 127
+10xxxxxx xxxxxxxx                            for 128 to 16,511
+110xxxxx xxxxxxxx xxxxxxxx                   for 16,512 to 2,113,663
+1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx          for 2,113,664 to 270,549,119
+11110xxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx for 270,549,120 to 34,630,287,487
+11111xxx                                     reserved (corrupted record)
+Note: Some 5-byte values are reserved, because the tablespace identifier
+and page number can only be up to 4,294,967,295.
+
+If same_page=1 is set in a record that follows a same_page=0 record
+in a mini-transaction, the tablespace identifier and page number
+fields will be omitted.
+
+For FILE_ records (if same_page=1 for the first record
+of a mini-transaction), we will write a tablespace identifier and
+a page number (always 0) using the same 1-to-5-byte encoding.
+
+For FREE_PAGE or INIT_PAGE, if same_page=1, the record will be treated
+as corrupted (or reserved for future extension).  The type code must
+be followed by 1+1 to 5+5 bytes (to encode the tablespace identifier
+and page number). If the record length does not match the encoded
+lengths of the tablespace identifier and page number, the record will
+be treated as corrupted. This allows future expansion of the format.
+
+If there is a FREE_PAGE record in a mini-transaction, it must be the
+only record for that page in the mini-transaction. If there is an
+INIT_PAGE record for a page in a mini-transaction, it must be the
+first record for that page in the mini-transaction.
+
+An EXTENDED record must be followed by 1+1 to 5+5 bytes for the page
+identifier (unless the same_page flag is set) and a subtype; @see mrec_ext_t
+
+For WRITE, MEMSET, MEMMOVE, the next 1 to 3 bytes are the byte offset
+on the page, relative from the previous offset. If same_page=0, the
+"previous offset" is 0. If same_page=1, the "previous offset" is where
+the previous operation ended (FIL_PAGE_TYPE for INIT_PAGE).
+0xxxxxxx                                     for 0 to 127
+10xxxxxx xxxxxxxx                            for 128 to 16,511
+110xxxxx xxxxxxxx xxxxxxxx                   for 16,512 to 2,113,663
+111xxxxx                                     reserved (corrupted record)
+If the sum of the "previous offset" and the current offset exceeds the
+page size, the record is treated as corrupted. Negative relative offsets
+cannot be written. Instead, a record with same_page=0 can be written.
+
+For MEMSET and MEMMOVE, the target length will follow, encoded in 1 to
+3 bytes.  If the length+offset exceeds the page size, the record will
+be treated as corrupted.
+
+For MEMMOVE, the source offset will follow, encoded in 1 to 3 bytes,
+relative to the current offset. The offset 0 is not possible, and
+the sign bit is the least significant bit. That is,
++x is encoded as (x-1)<<1 (+1,+2,+3,... is 0,2,4,...) and
+-x is encoded as (x-1)<<1|1 (-1,-2,-3,... is 1,3,5,...).
+The source offset must be within the page size, or else the record
+will be treated as corrupted.
+
+For MEMSET or WRITE, the byte(s) to be written will follow. For
+MEMSET, it usually is a single byte, but it could also be a multi-byte
+string, which would be copied over and over until the target length is
+reached. The length of the remaining bytes is implied by the length
+bytes at the start of the record.
+
+For MEMMOVE, if any bytes follow, the record is treated as corrupted
+(future expansion).
+
+As mentioned at the start of this comment, the type byte 0 would be
+special, marking the end of a mini-transaction. We could use the
+corresponding value 0x80 (with same_page=1) for something special,
+such as a future extension when more type codes are needed, or for
+encoding rarely needed redo log records.
+
+Examples:
+
+INIT could be logged as 0x12 0x34 0x56, meaning "type code 1 (INIT), 2
+bytes to follow" and "tablespace ID 0x34", "page number 0x56".
+The first byte must be between 0x12 and 0x1a, and the total length of
+the record must match the lengths of the encoded tablespace ID and
+page number.
+
+WRITE could be logged as 0x36 0x40 0x57 0x60 0x12 0x34 0x56, meaning
+"type code 3 (WRITE), 6 bytes to follow" and "tablespace ID 0x40",
+"page number 0x57", "byte offset 0x60", data 0x34,0x56.
+
+A subsequent WRITE to the same page could be logged 0xb5 0x7f 0x23
+0x34 0x56 0x78, meaning "same page, type code 3 (WRITE), 5 bytes to
+follow", "byte offset 0x7f"+0x60+2, bytes 0x23,0x34,0x56,0x78.
+
+The end of the mini-transaction would be indicated by the end byte
+0x00 or 0x01; @see log_sys.get_sequence_bit().
+If log_sys.is_encrypted(), that is followed by 8 bytes of nonce
+(part of initialization vector). That will be followed by 4 bytes
+of CRC-32C of the entire mini-transaction, excluding the end byte. */
+
+/** Redo log record types. These bit patterns (3 bits) will be written
+to the redo log file, so the existing codes or their interpretation on
+crash recovery must not be changed. */
+enum mrec_type_t
+{
+  /** Free a page. On recovery, it is unnecessary to read the page.
+  The next record for the page (if any) must be INIT_PAGE.
+  After this record has been written, the page may be
+  overwritten with zeros, or discarded or trimmed. */
+  FREE_PAGE= 0,
+  /** Zero-initialize a page. The current byte offset (for subsequent
+  records) will be reset to FIL_PAGE_TYPE. */
+  INIT_PAGE= 0x10,
+  /** Extended record; @see mrec_ext_t */
+  EXTENDED= 0x20,
+  /** Write a string of bytes. Followed by the byte offset (unsigned,
+  relative to the current byte offset, encoded in 1 to 3 bytes) and
+  the bytes to write (at least one). The current byte offset will be
+  set after the last byte written. */
+  WRITE= 0x30,
+  /** Like WRITE, but before the bytes to write, the data_length-1
+  (encoded in 1 to 3 bytes) will be encoded, and it must be more
+  than the length of the following data bytes to write.
+  The data byte(s) will be repeatedly copied to the output until
+  the data_length is reached. */
+  MEMSET= 0x40,
+  /** Like MEMSET, but instead of the bytes to write, a source byte
+  offset (signed, nonzero, relative to the target byte offset, encoded
+  in 1 to 3 bytes, with the sign bit in the least significant bit)
+  will be written.
+  That is, +x is encoded as (x-1)<<1 (+1,+2,+3,... is 0,2,4,...)
+  and -x is encoded as (x-1)<<1|1 (-1,-2,-3,... is 1,3,5,...).
+  The source offset and data_length must be within the page size, or
+  else the record will be treated as corrupted. The data will be
+  copied from the page as it was at the start of the
+  mini-transaction. */
+  MEMMOVE= 0x50,
+  /** Reserved for future use. */
+  RESERVED= 0x60,
+  /** Optional record that may be ignored in crash recovery.
+  A subtype (@see mrec_opt) will be encoded after the page identifier. */
+  OPTION= 0x70
+};
+
+
+/** Supported EXTENDED record subtypes. */
+enum mrec_ext_t
+{
+  /** Partly initialize a ROW_FORMAT=REDUNDANT B-tree or R-tree index page,
+  including writing the "infimum" and "supremum" pseudo-records.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INIT_ROW_FORMAT_REDUNDANT= 0,
+  /** Partly initialize a ROW_FORMAT=COMPACT or DYNAMIC index page,
+  including writing the "infimum" and "supremum" pseudo-records.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INIT_ROW_FORMAT_DYNAMIC= 1,
+  /** Initialize an undo log page.
+  This is roughly (not exactly) equivalent to the old MLOG_UNDO_INIT record.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  UNDO_INIT= 2,
+  /** Append a record to an undo log page.
+  This is equivalent to the old MLOG_UNDO_INSERT record.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  UNDO_APPEND= 3,
+  /** Insert a ROW_FORMAT=REDUNDANT record, extending PAGE_HEAP_TOP.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INSERT_HEAP_REDUNDANT= 4,
+  /** Insert a ROW_FORMAT=REDUNDANT record, reusing PAGE_FREE.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INSERT_REUSE_REDUNDANT= 5,
+  /** Insert a ROW_FORMAT=COMPACT or DYNAMIC record, extending PAGE_HEAP_TOP.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INSERT_HEAP_DYNAMIC= 6,
+  /** Insert a ROW_FORMAT=COMPACT or DYNAMIC record, reusing PAGE_FREE.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INSERT_REUSE_DYNAMIC= 7,
+  /** Delete a record on a ROW_FORMAT=REDUNDANT page.
+  We point to the predecessor of the record to be deleted.
+  The current byte offset will be reset to FIL_PAGE_TYPE.
+  This is similar to the old MLOG_REC_DELETE record. */
+  DELETE_ROW_FORMAT_REDUNDANT= 8,
+  /** Delete a record on a ROW_FORMAT=COMPACT or DYNAMIC page.
+  We point to the predecessor of the record to be deleted
+  and include the total size of the record being deleted.
+  The current byte offset will be reset to FIL_PAGE_TYPE.
+  This is similar to the old MLOG_COMP_REC_DELETE record. */
+  DELETE_ROW_FORMAT_DYNAMIC= 9,
+  /** Truncate a data file. */
+  TRIM_PAGES= 10
+};
+
+
+/** Recognized OPTION record subtypes. */
+enum mrec_opt
+{
+  /** page checksum at the end of the mini-transaction */
+  OPT_PAGE_CHECKSUM= 0
+  /* Other possible subtypes: a binlog record, or an SQL statement. */
+};
+
+
+/** Redo log record types for file-level operations. These bit
+patterns will be written to redo log files, so the existing codes or
+their interpretation on crash recovery must not be changed. */
+enum mfile_type_t
+{
+  /** Create a file. Followed by tablespace ID and the file name. */
+  FILE_CREATE = 0x80,
+  /** Delete a file. Followed by tablespace ID and the file name.  */
+  FILE_DELETE = 0x90,
+  /** Rename a file. Followed by tablespace ID and the old file name,
+  NUL, and the new file name.  */
+  FILE_RENAME = 0xa0,
+  /** Modify a file. Followed by tablespace ID and the file name. */
+  FILE_MODIFY = 0xb0,
+  /** End-of-checkpoint marker, at the end of a mini-transaction.
+  Followed by 2 NUL bytes of page identifier and 8 bytes of LSN;
+  @see SIZE_OF_FILE_CHECKPOINT.
+  When all bytes are NUL, this is a dummy padding record. */
+  FILE_CHECKPOINT = 0xf0
+};
+
+/** Size of a FILE_CHECKPOINT record, including the trailing byte to
+terminate the mini-transaction and the CRC-32C. */
+constexpr byte SIZE_OF_FILE_CHECKPOINT= 3/*type,page_id*/ + 8/*LSN*/ + 1 + 4;
+
+#ifndef UNIV_INNOCHECKSUM
+/** Types of the latched objects that are stored in mtr_t::m_memo */
+enum mtr_memo_type_t {
+	MTR_MEMO_PAGE_S_FIX = RW_S_LATCH,	/*!< page with a shared latch */
+
+	MTR_MEMO_PAGE_X_FIX = RW_X_LATCH,	/*!< page with an exclusive latch */
+
+	MTR_MEMO_PAGE_SX_FIX = RW_SX_LATCH,	/*!< page with an update (or exclusive) latch */
+
+	MTR_MEMO_BUF_FIX = RW_NO_LATCH,	/*!< buffer-fixed page, no latch */
+
+	MTR_MEMO_MODIFY = 16,	/*!< flag: the latched page is being modified */
+	/* page latch types combined with the MTR_MEMO_MODIFY flag */
+	MTR_MEMO_PAGE_X_MODIFY = MTR_MEMO_PAGE_X_FIX | MTR_MEMO_MODIFY,
+	MTR_MEMO_PAGE_SX_MODIFY = MTR_MEMO_PAGE_SX_FIX | MTR_MEMO_MODIFY,
+
+	MTR_MEMO_S_LOCK = RW_S_LATCH << 5,	/*!< index_lock, shared; @see mtr_t::s_lock() */
+
+	MTR_MEMO_X_LOCK = RW_X_LATCH << 5,	/*!< index_lock, exclusive; @see mtr_t::x_lock() */
+
+	MTR_MEMO_SX_LOCK = RW_SX_LATCH << 5,	/*!< index_lock, update; @see mtr_t::u_lock() */
+
+	/** wr_lock() on fil_space_t::latch */
+	MTR_MEMO_SPACE_X_LOCK = MTR_MEMO_SX_LOCK << 1
+};
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h
new file mode 100644
index 00000000..c9db6a1f
--- /dev/null
+++ b/storage/innobase/include/os0file.h
@@ -0,0 +1,1188 @@
+/***********************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/os0file.h
+The interface to the operating system file io
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0file_h
+#define os0file_h
+
+#include "fsp0types.h"
+#include "tpool.h"
+#include "my_counter.h"
+
+#ifndef _WIN32
+#include <dirent.h>
+#include <sys/stat.h>
+#include <time.h>
+#endif /* !_WIN32 */
+
+extern bool	os_has_said_disk_full;
+
+/** File offset in bytes */
+typedef ib_uint64_t os_offset_t;
+
+class buf_tmp_buffer_t;
+
+#ifdef _WIN32
+
+/** We always define WIN_ASYNC_IO, and check at run-time whether
+the OS actually supports it: Win 95 does not, NT does. */
+# define WIN_ASYNC_IO
+
+/** Use unbuffered I/O */
+# define UNIV_NON_BUFFERED_IO
+
+/** File handle (native handle on Windows) */
+typedef native_file_handle os_file_t;
+
+
+#else /* _WIN32 */
+
+/** File handle (an int file descriptor on other platforms) */
+typedef int	os_file_t;
+
+#endif /* _WIN32 */
+/** Handle value that does not refer to an open file */
+static const os_file_t OS_FILE_CLOSED = IF_WIN(os_file_t(INVALID_HANDLE_VALUE),-1);
+
+/** File descriptor with optional PERFORMANCE_SCHEMA instrumentation */
+struct pfs_os_file_t
+{
+	/** Default and converting constructor; m_psi (if any) starts NULL */
+	pfs_os_file_t(os_file_t file = OS_FILE_CLOSED) : m_file(file)
+#ifdef UNIV_PFS_IO
+	, m_psi(NULL)
+#endif
+	{}
+
+	/** The wrapped file handle */
+	os_file_t   m_file;
+#ifdef UNIV_PFS_IO
+	/** PERFORMANCE_SCHEMA descriptor */
+	struct PSI_file *m_psi;
+#endif
+	/** Implicit type conversion.
+	@return the wrapped file handle */
+	operator os_file_t() const { return m_file; }
+	/** Assign the wrapped handle only; m_psi (if any) is not changed.
+	@param[in]	file	file handle to be assigned */
+	void operator=(os_file_t file) { m_file = file; }
+	bool operator==(os_file_t file) const { return m_file == file; }
+	bool operator!=(os_file_t file) const { return !(*this == file); }
+#ifndef DBUG_OFF
+	friend std::ostream& operator<<(std::ostream& os, pfs_os_file_t f){
+		os << os_file_t(f);
+		return os;
+	}
+#endif
+};
+
+/** Options for os_file_create_func @{ */
+enum os_file_create_t {
+	OS_FILE_OPEN = 51,		/*!< to open an existing file (if
+					doesn't exist, error) */
+	OS_FILE_CREATE,			/*!< to create new file (if
+					exists, error) */
+	OS_FILE_OVERWRITE,		/*!< to create a new file; if it
+					exists, overwrite the old file */
+	OS_FILE_OPEN_RAW,		/*!< to open a raw device or disk
+					partition */
+	OS_FILE_CREATE_PATH,		/*!< to create the directories */
+	OS_FILE_OPEN_RETRY,		/*!< open with retry */
+
+	/** Flags that can be combined with the above values. Please ensure
+	that the above values stay below 128. */
+
+	OS_FILE_ON_ERROR_NO_EXIT = 128,	/*!< do not exit on unknown errors */
+	OS_FILE_ON_ERROR_SILENT = 256	/*!< don't print diagnostic messages to
+					the log unless it is a fatal error,
+					this flag is only used if
+					ON_ERROR_NO_EXIT is set */
+};
+
+static const ulint OS_FILE_READ_ONLY = 333;
+static const ulint OS_FILE_READ_WRITE = 444;
+
+/** Used by MySQLBackup */
+static const ulint OS_FILE_READ_ALLOW_DELETE = 555;
+
+/* Values of the "purpose" parameter of os_file_create_func */
+static const ulint OS_FILE_AIO = 61;
+static const ulint OS_FILE_NORMAL = 62;
+/* @} */
+
+/** File types, for the "type" parameter of os_file_create_func @{ */
+static const ulint OS_DATA_FILE = 100;
+static const ulint OS_LOG_FILE = 101;
+static const ulint OS_DATA_FILE_NO_O_DIRECT = 103;
+/* @} */
+
+/** Error codes from os_file_get_last_error @{ */
+static const ulint OS_FILE_NAME_TOO_LONG = 36;
+static const ulint OS_FILE_NOT_FOUND = 71;
+static const ulint OS_FILE_DISK_FULL = 72;
+static const ulint OS_FILE_ALREADY_EXISTS = 73;
+static const ulint OS_FILE_PATH_ERROR = 74;
+
+/** wait for OS aio resources to become available again */
+static const ulint OS_FILE_AIO_RESOURCES_RESERVED = 75;
+
+static const ulint OS_FILE_SHARING_VIOLATION = 76;
+static const ulint OS_FILE_ERROR_NOT_SPECIFIED = 77;
+static const ulint OS_FILE_INSUFFICIENT_RESOURCE = 78;
+static const ulint OS_FILE_AIO_INTERRUPTED = 79;
+static const ulint OS_FILE_OPERATION_ABORTED = 80;
+static const ulint OS_FILE_ACCESS_VIOLATION = 81;
+static const ulint OS_FILE_OPERATION_NOT_SUPPORTED = 125;
+static const ulint OS_FILE_ERROR_MAX = 200;
+/* @} */
+
+/** The I/O context that is passed down to the low level IO code */
+class IORequest
+{
+public:
+  /** Request type; the enumerators are combinations of bit flags */
+  enum Type
+  {
+    /** Synchronous read */
+    READ_SYNC= 2,
+    /** Asynchronous read; some errors will be ignored */
+    READ_ASYNC= READ_SYNC | 1,
+    /** Possibly partial read; only used with
+    os_file_read_no_error_handling() */
+    READ_MAYBE_PARTIAL= READ_SYNC | 4,
+    /** Read for doublewrite buffer recovery */
+    DBLWR_RECOVER= READ_SYNC | 8,
+    /** Synchronous write */
+    WRITE_SYNC= 16,
+    /** Asynchronous write */
+    WRITE_ASYNC= WRITE_SYNC | 1,
+    /** A doublewrite batch */
+    DBLWR_BATCH= WRITE_ASYNC | 8,
+    /** Write data; evict the block on write completion */
+    WRITE_LRU= WRITE_ASYNC | 32,
+    /** Write data and punch hole for the rest */
+    PUNCH= WRITE_ASYNC | 64,
+    /** Write data and punch hole; evict the block on write completion */
+    PUNCH_LRU= PUNCH | WRITE_LRU,
+    /** Zero out a range of bytes in fil_space_t::io() */
+    PUNCH_RANGE= WRITE_SYNC | 128,
+  };
+
+  constexpr IORequest(buf_page_t *bpage, buf_tmp_buffer_t *slot,
+                      fil_node_t *node, Type type) :
+    bpage(bpage), slot(slot), node(node), type(type) {}
+  /** Construct a request that has no file node (node stays nullptr) */
+  constexpr IORequest(Type type= READ_SYNC, buf_page_t *bpage= nullptr,
+                      buf_tmp_buffer_t *slot= nullptr) :
+    bpage(bpage), slot(slot), type(type) {}
+  /* (WRITE_LRU ^ WRITE_ASYNC) is the bit 32; (READ_SYNC ^ READ_ASYNC) is 1 */
+  bool is_read() const { return (type & READ_SYNC) != 0; }
+  bool is_write() const { return (type & WRITE_SYNC) != 0; }
+  bool is_LRU() const { return (type & (WRITE_LRU ^ WRITE_ASYNC)) != 0; }
+  bool is_async() const { return (type & (READ_SYNC ^ READ_ASYNC)) != 0; }
+
+  void write_complete(int io_error) const;
+  void read_complete(int io_error) const;
+  void fake_read_complete(os_offset_t offset) const;
+
+  /** If the PUNCH bit (64) is set and off, len, node are nonzero, punch a hole.
+  @param off   byte offset from the start (SEEK_SET)
+  @param len   size of the hole in bytes
+  @return DB_SUCCESS (also when nothing was requested) or error code */
+  dberr_t maybe_punch_hole(os_offset_t off, ulint len) const
+  {
+    return off && len && node && (type & (PUNCH ^ WRITE_ASYNC))
+      ? punch_hole(off, len)
+      : DB_SUCCESS;
+  }
+
+private:
+  /** Free storage space associated with a section of the file.
+  @param off   byte offset from the start (SEEK_SET)
+  @param len   size of the hole in bytes
+  @return DB_SUCCESS or error code */
+  dberr_t punch_hole(os_offset_t off, ulint len) const;
+
+public:
+  /** Page to be written on write operation */
+  buf_page_t *const bpage= nullptr;
+
+  /** Memory to be used for encrypted or page_compressed pages */
+  buf_tmp_buffer_t *const slot= nullptr;
+
+  /** File descriptor */
+  fil_node_t *const node= nullptr;
+
+  /** Request type bit flags */
+  const Type type;
+};
+/** Ready-made request contexts for the plain synchronous cases */
+constexpr IORequest IORequestRead(IORequest::READ_SYNC);
+constexpr IORequest IORequestReadPartial(IORequest::READ_MAYBE_PARTIAL);
+constexpr IORequest IORequestWrite(IORequest::WRITE_SYNC);
+
+/** File size information, including the allocated size of sparse files. */
+struct os_file_size_t {
+	/** Total size of the file in bytes */
+	os_offset_t	m_total_size;
+
+	/** If it is a sparse file then this is the number of bytes
+	actually allocated for the file. */
+	os_offset_t	m_alloc_size;
+};
+
+constexpr ulint OS_AIO_N_PENDING_IOS_PER_THREAD= 256;
+/* Atomic counters; going by their names: file reads, file writes, fsyncs */
+extern Atomic_counter<ulint> os_n_file_reads;
+extern Atomic_counter<size_t> os_n_file_writes;
+extern Atomic_counter<size_t> os_n_fsyncs;
+
+/* File types of a directory entry; used as the type of os_file_stat_t::type */
+
+enum os_file_type_t {
+	OS_FILE_TYPE_UNKNOWN = 0,
+	OS_FILE_TYPE_FILE,			/* regular file */
+	OS_FILE_TYPE_DIR,			/* directory */
+	OS_FILE_TYPE_LINK,			/* symbolic link */
+	OS_FILE_TYPE_BLOCK			/* block device */
+};
+
+/* Maximum path string length in bytes when referring to tables within the
+'./databasename/tablename.ibd' path format; we can allocate at least 2 buffers
+of this size from the thread stack; that is why this should not be made much
+bigger than 4000 bytes.  The maximum path length used by any storage engine
+in the server must be at least this big. */
+
+/* MySQL 5.7 my_global.h */
+#ifndef FN_REFLEN_SE
+#define FN_REFLEN_SE        4000
+#endif
+
+#define OS_FILE_MAX_PATH	4000
+#if (FN_REFLEN_SE < OS_FILE_MAX_PATH)
+# error "(FN_REFLEN_SE < OS_FILE_MAX_PATH)"
+#endif
+
+/** Struct used in fetching information of a file in a directory */
+struct os_file_stat_t {
+	char		name[OS_FILE_MAX_PATH];	/*!< path to a file */
+	os_file_type_t	type;			/*!< file type */
+	os_offset_t	size;			/*!< file size in bytes */
+	os_offset_t	alloc_size;		/*!< Allocated size for
+						sparse files in bytes */
+	size_t		block_size;		/*!< Block size to use for IO
+						in bytes */
+	time_t		ctime;			/*!< creation time */
+	time_t		mtime;			/*!< modification time */
+	time_t		atime;			/*!< access time */
+	bool		rw_perm;		/*!< true if can be opened
+						in read-write mode. Only valid
+						if type == OS_FILE_TYPE_FILE */
+};
+
+/** Create a temporary file. This function is like tmpfile(3), but
+the temporary file is created in the directory given by the mysql server
+configuration parameter (--tmpdir).
+@return temporary file handle, or NULL on error */
+FILE*
+os_file_create_tmpfile();
+
+/**
+This function attempts to create a directory named pathname. The new directory
+gets default permissions. On Unix, the permissions are (0770 & ~umask). If the
+directory exists already, nothing is done and the call succeeds, unless the
+fail_if_exists argument is true.
+
+@param[in]	pathname	directory name as null-terminated string
+@param[in]	fail_if_exists	if true, pre-existing directory is treated
+				as an error.
+@return true if call succeeds, false on error */
+bool
+os_file_create_directory(
+	const char*	pathname,
+	bool		fail_if_exists);
+
+/** NOTE! Use the corresponding macro os_file_create_simple(), not directly
+this function!
+A simple function to open or create a file.
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode
+@param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded, false if error
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_simple_func(
+	const char*	name,
+	ulint		create_mode,
+	ulint		access_type,
+	bool		read_only,
+	bool*		success);
+
+/** NOTE! Use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A simple function to open or create a file.
+@param[in]	name		name of the file or path as a null-terminated string
+@param[in]	create_mode	create mode
+@param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+				OS_FILE_READ_ALLOW_DELETE; the last option
+				is used by a backup program reading the file
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded, false if error
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_simple_no_error_handling_func(
+	const char*	name,
+	ulint		create_mode,
+	ulint		access_type,
+	bool		read_only,
+	bool*		success)
+	MY_ATTRIBUTE((warn_unused_result));
+
+#ifdef  _WIN32
+#define os_file_set_nocache(fd, file_name, operation_name) do{}while(0)
+#else
+/** Tries to disable OS caching on an opened file descriptor.
+(On _WIN32, os_file_set_nocache() is a macro that does nothing.)
+@param[in]	fd		file descriptor to alter
+@param[in]	file_name	file name, used in the diagnostic message
+@param[in]	operation_name	"open" or "create"; used in diagnostics */
+void
+os_file_set_nocache(
+/*================*/
+	int	fd,		/*!< in: file descriptor to alter */
+	const char*	file_name,
+	const char*	operation_name);
+#endif
+
+#ifndef _WIN32 /* On Microsoft Windows, mandatory locking is used */
+/** Obtain an exclusive lock on a file (declared on non-Windows builds only).
+@param fd      file descriptor
+@param name    file name
+@return 0 on success */
+int os_file_lock(int fd, const char *name);
+#endif
+
+/** NOTE! Use the corresponding macro os_file_create(), not directly
+this function!
+Opens an existing file or creates a new.
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode; see os_file_create_t
+@param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
+				is desired, OS_FILE_NORMAL, if any normal file;
+				NOTE that it also depends on type, os_aio_..
+				and srv_.. variables whether we really use
+				async I/O or unbuffered I/O: look in the
+				function source code for the exact rules
+@param[in]	type		OS_DATA_FILE or OS_LOG_FILE
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_func(
+	const char*	name,
+	ulint		create_mode,
+	ulint		purpose,
+	ulint		type,
+	bool		read_only,
+	bool*		success)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Deletes a file. The file has to be closed before calling this.
+@param[in]	name		file path as a null-terminated string
+@return true on success */
+bool
+os_file_delete_func(const char* name);
+
+/** Deletes a file if it exists. The file has to be closed before calling this.
+@param[in]	name		file path as a null-terminated string
+@param[out]	exist		indicates whether the file pre-existed
+@return true on success */
+bool
+os_file_delete_if_exists_func(const char* name, bool* exist);
+
+/** NOTE! Use the corresponding macro os_file_rename(), not directly
+this function!
+Renames a file (can also move it to another directory). It is safest that the
+file is closed before calling this function.
+@param[in]	oldpath		old file path as a null-terminated string
+@param[in]	newpath		new file path as a null-terminated string
+@return true on success */
+bool
+os_file_rename_func(const char* oldpath, const char* newpath);
+
+/** NOTE! Use the corresponding macro os_file_close(), not directly this
+function!
+Closes a file handle. In case of error, the error number can be retrieved
+with os_file_get_last_error.
+@param[in]	file		own: handle to a file
+@return true on success */
+bool os_file_close_func(os_file_t file);
+
+#ifdef UNIV_PFS_IO
+
+/* Keys to register InnoDB file I/O with performance schema */
+extern mysql_pfs_key_t	innodb_data_file_key;
+extern mysql_pfs_key_t	innodb_temp_file_key;
+
+/* The following four pairs of macros are instrumentation hooks that
+register various file I/O operations with performance schema.
+1) register_pfs_file_open_begin() and register_pfs_file_open_end() are
+used to register file creation, opening, closing and renaming.
+2) register_pfs_file_rename_begin() and register_pfs_file_rename_end()
+are used to register file renaming
+3) register_pfs_file_io_begin() and register_pfs_file_io_end() are
+used to register actual file read, write and flush
+4) register_pfs_file_close_begin() and register_pfs_file_close_end()
+are used to register file deletion operations */
+# define register_pfs_file_open_begin(state, locker, key, op, name,	\
+				      src_file, src_line)		\
+do {									\
+	locker = PSI_FILE_CALL(get_thread_file_name_locker)(		\
+		state, key, op, name, &locker);				\
+	if (locker != NULL) {						\
+		PSI_FILE_CALL(start_file_open_wait)(			\
+			locker, src_file, src_line);			\
+	}								\
+} while (0)
+
+# define register_pfs_file_open_end(locker, file, result)		\
+do {									\
+	if (locker != NULL) {						\
+		file.m_psi = PSI_FILE_CALL(end_file_open_wait)(	\
+			locker, result);				\
+	}								\
+} while (0)
+
+# define register_pfs_file_rename_begin(state, locker, key, op, name,	\
+				src_file, src_line)			\
+	register_pfs_file_open_begin(state, locker, key, op, name,	\
+					src_file, src_line)
+
+# define register_pfs_file_rename_end(locker, from, to, result)		\
+do {									\
+	if (locker != NULL) {						\
+		 PSI_FILE_CALL(						\
+			end_file_rename_wait)(				\
+			locker, from, to, result);			\
+	}								\
+} while (0)
+
+# define register_pfs_file_close_begin(state, locker, key, op, name,	\
+				      src_file, src_line)		\
+do {									\
+	locker = PSI_FILE_CALL(get_thread_file_name_locker)(		\
+		state, key, op, name, &locker);				\
+	if (locker != NULL) {						\
+		PSI_FILE_CALL(start_file_close_wait)(			\
+			locker, src_file, src_line);			\
+	}								\
+} while (0)
+
+# define register_pfs_file_close_end(locker, result)			\
+do {									\
+	if (locker != NULL) {						\
+		PSI_FILE_CALL(end_file_close_wait)(			\
+			locker, result);				\
+	}								\
+} while (0)
+
+# define register_pfs_file_io_begin(state, locker, file, count, op,	\
+				    src_file, src_line)			\
+do {									\
+	locker = PSI_FILE_CALL(get_thread_file_stream_locker)(		\
+		state, file.m_psi, op);					\
+	if (locker != NULL) {						\
+		PSI_FILE_CALL(start_file_wait)(				\
+			locker, count, src_file, src_line);		\
+	}								\
+} while (0)
+
+# define register_pfs_file_io_end(locker, count)			\
+do {									\
+	if (locker != NULL) {						\
+		PSI_FILE_CALL(end_file_wait)(locker, count);		\
+	}								\
+} while (0)
+
+/* The following macros are the file I/O APIs. Because "UNIV_PFS_IO" is
+defined in this branch, they expand to calls of wrapper functions that
+carry performance schema instrumentation:
+
+os_file_create
+os_file_create_simple
+os_file_create_simple_no_error_handling
+os_file_close
+os_file_read
+os_file_write
+os_file_flush
+os_file_rename
+os_file_delete
+os_file_delete_if_exists
+The wrapper functions have the prefix "pfs_" and the suffix "_func". */
+
+# define os_file_create(key, name, create, purpose, type, read_only,	\
+			success)					\
+	pfs_os_file_create_func(key, name, create, purpose,	type,	\
+		read_only, success, __FILE__, __LINE__)
+
+# define os_file_create_simple(key, name, create, access,		\
+		read_only, success)					\
+	pfs_os_file_create_simple_func(key, name, create, access,	\
+		read_only, success, __FILE__, __LINE__)
+
+# define os_file_create_simple_no_error_handling(			\
+	key, name, create_mode, access, read_only, success)		\
+	pfs_os_file_create_simple_no_error_handling_func(		\
+		key, name, create_mode, access,				\
+		read_only, success, __FILE__, __LINE__)
+
+# define os_file_close(file)						\
+	pfs_os_file_close_func(file, __FILE__, __LINE__)
+
+# define os_file_read(type, file, buf, offset, n, o)			\
+	pfs_os_file_read_func(type, file, buf, offset, n,o, __FILE__, __LINE__)
+
+# define os_file_write(type, name, file, buf, offset, n)	\
+	pfs_os_file_write_func(type, name, file, buf, offset,	\
+			       n, __FILE__, __LINE__)
+
+# define os_file_flush(file)					\
+	pfs_os_file_flush_func(file, __FILE__, __LINE__)
+
+# define os_file_rename(key, oldpath, newpath)				\
+	pfs_os_file_rename_func(key, oldpath, newpath, __FILE__, __LINE__)
+
+# define os_file_delete(key, name)					\
+	pfs_os_file_delete_func(key, name, __FILE__, __LINE__)
+
+# define os_file_delete_if_exists(key, name, exist)			\
+	pfs_os_file_delete_if_exists_func(key, name, exist, __FILE__, __LINE__)
+
+/** NOTE! Please use the corresponding macro os_file_create_simple(),
+not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple() which opens or creates a file.
+@param[in]	key		Performance Schema Key
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode
+@param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded
+@param[in]	src_file	source file of the caller (__FILE__)
+@param[in]	src_line	source line of the caller (__LINE__)
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_simple_func(
+	mysql_pfs_key_t key,
+	const char*	name,
+	ulint		create_mode,
+	ulint		access_type,
+	bool		read_only,
+	bool*		success,
+	const char*	src_file,
+	uint		src_line)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** NOTE! Please use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple_no_error_handling(). Add instrumentation to
+monitor file creation/open.
+@param[in]	key		Performance Schema Key
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode
+@param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+				OS_FILE_READ_ALLOW_DELETE; the last option is
+				used by a backup program reading the file
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded
+@param[in]	src_file	source file of the caller (__FILE__)
+@param[in]	src_line	source line of the caller (__LINE__)
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_simple_no_error_handling_func(
+	mysql_pfs_key_t key,
+	const char*	name,
+	ulint		create_mode,
+	ulint		access_type,
+	bool		read_only,
+	bool*		success,
+	const char*	src_file,
+	uint		src_line)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** NOTE! Please use the corresponding macro os_file_create(), not directly
+this function!
+A performance schema wrapper function for os_file_create().
+Add instrumentation to monitor file creation/open.
+@param[in]	key		Performance Schema Key
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode
+@param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
+				is desired, OS_FILE_NORMAL, if any normal file;
+				whether async or unbuffered I/O is really used
+				also depends on type and on os_aio_.. and
+				srv_.. variables: see the function source code
+@param[in]	type		OS_DATA_FILE or OS_LOG_FILE
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded
+@param[in]	src_file	source file of the caller (__FILE__)
+@param[in]	src_line	source line of the caller (__LINE__)
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_func(
+	mysql_pfs_key_t key,
+	const char*	name,
+	ulint		create_mode,
+	ulint		purpose,
+	ulint		type,
+	bool		read_only,
+	bool*		success,
+	const char*	src_file,
+	uint		src_line)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** NOTE! Please use the corresponding macro os_file_close(), not directly
+this function!
+A performance schema instrumented wrapper function for os_file_close().
+@param[in]	file		handle to a file
+@param[in]	src_file	source file of the caller (__FILE__)
+@param[in]	src_line	source line of the caller (__LINE__)
+@return true on success */
+UNIV_INLINE
+bool
+pfs_os_file_close_func(
+	pfs_os_file_t	file,
+	const char*	src_file,
+	uint		src_line);
+
+/** NOTE! Use the macro os_file_read() instead of calling this directly!
+This is the performance schema instrumented wrapper function for
+os_file_read() which requests a synchronous read operation.
+@param[in]	type		IO request context
+@param[in]	file		Open file handle
+@param[out]	buf		buffer where to read
+@param[in]	offset		file offset where to read
+@param[in]	n		number of bytes to read
+@param[out]	o		number of bytes actually read
+@param[in]	src_file	source file of the caller (__FILE__)
+@param[in]	src_line	source line of the caller (__LINE__)
+@return DB_SUCCESS if request was successful */
+UNIV_INLINE
+dberr_t
+pfs_os_file_read_func(
+	const IORequest&	type,
+	pfs_os_file_t		file,
+	void*			buf,
+	os_offset_t		offset,
+	ulint			n,
+	ulint*			o,
+	const char*		src_file,
+	uint			src_line);
+
+/** NOTE! Please use the corresponding macro os_file_write(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_write() which requests a synchronous write operation.
+@param[in]	type		IO request context
+@param[in]	name		Name of the file or path as NUL terminated
+				string
+@param[in]	file		Open file handle
+@param[in]	buf		buffer from which to write
+@param[in]	offset		file offset where to write
+@param[in]	n		number of bytes to write
+@param[in]	src_file	source file of the caller (__FILE__)
+@param[in]	src_line	source line of the caller (__LINE__)
+@return DB_SUCCESS if request was successful */
+UNIV_INLINE
+dberr_t
+pfs_os_file_write_func(
+	const IORequest&	type,
+	const char*		name,
+	pfs_os_file_t		file,
+	const void*		buf,
+	os_offset_t		offset,
+	ulint			n,
+	const char*		src_file,
+	uint			src_line);
+
+/** NOTE! Please use the corresponding macro os_file_flush(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_flush(), which flushes the write buffers of a given file
+to the disk.
+@param[in]	file		Open file handle
+@param[in]	src_file	source file of the caller (__FILE__)
+@param[in]	src_line	source line of the caller (__LINE__)
+@return true on success */
+UNIV_INLINE
+bool
+pfs_os_file_flush_func(
+	pfs_os_file_t	file,
+	const char*	src_file,
+	uint		src_line);
+
+
+/** NOTE! Please use the corresponding macro os_file_rename(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_rename()
+@param[in]	key		Performance Schema Key
+@param[in]	oldpath		old file path as a null-terminated string
+@param[in]	newpath		new file path
+@param[in]	src_file	source file of the caller (__FILE__)
+@param[in]	src_line	source line of the caller (__LINE__)
+@return true on success */
+UNIV_INLINE
+bool
+pfs_os_file_rename_func(
+	mysql_pfs_key_t	key,
+	const char*	oldpath,
+	const char*	newpath,
+	const char*	src_file,
+	uint		src_line);
+
+/**
+NOTE! Please use the corresponding macro os_file_delete(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete()
+@param[in]	key		Performance Schema Key
+@param[in]	name		file path as a null-terminated string
+@param[in]	src_file	source file of the caller (__FILE__)
+@param[in]	src_line	source line of the caller (__LINE__)
+@return true on success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_func(
+	mysql_pfs_key_t	key,
+	const char*	name,
+	const char*	src_file,
+	uint		src_line);
+
+/**
+NOTE! Please use the corresponding macro os_file_delete_if_exists(), not
+directly this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete_if_exists()
+@param[in]	key		Performance Schema Key
+@param[in]	name		file path as a null-terminated string
+@param[out]	exist		indicates whether the file pre-existed
+@param[in]	src_file	source file of the caller (__FILE__)
+@param[in]	src_line	source line of the caller (__LINE__)
+@return true on success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_if_exists_func(
+	mysql_pfs_key_t	key,
+	const char*	name,
+	bool*		exist,
+	const char*	src_file,
+	uint		src_line);
+
+#else /* UNIV_PFS_IO */
+
+/* If UNIV_PFS_IO is not defined, these I/O APIs map to the original
+un-instrumented file I/O functions; any "key" argument is discarded */
+# define os_file_create(key, name, create, purpose, type, read_only,	\
+			success)					\
+	os_file_create_func(name, create, purpose, type, read_only,	\
+			success)
+
+# define os_file_create_simple(key, name, create_mode, access,		\
+		read_only, success)					\
+	os_file_create_simple_func(name, create_mode, access,		\
+		read_only, success)
+
+# define os_file_create_simple_no_error_handling(			\
+	key, name, create_mode, access, read_only, success)		\
+	os_file_create_simple_no_error_handling_func(			\
+		name, create_mode, access, read_only, success)
+
+# define os_file_close(file)	os_file_close_func(file)
+
+# define os_file_read(type, file, buf, offset, n, o)		\
+	os_file_read_func(type, file, buf, offset, n, o)
+
+# define os_file_write(type, name, file, buf, offset, n)	\
+	os_file_write_func(type, name, file, buf, offset, n)
+
+# define os_file_flush(file)	os_file_flush_func(file)
+
+# define os_file_rename(key, oldpath, newpath)				\
+	os_file_rename_func(oldpath, newpath)
+
+# define os_file_delete(key, name)	os_file_delete_func(name)
+
+# define os_file_delete_if_exists(key, name, exist)			\
+	os_file_delete_if_exists_func(name, exist)
+
+#endif	/* UNIV_PFS_IO */
+
+/** Gets a file size.
+@param[in]	filename	name of the file
+@return file size if OK, else set m_total_size to ~0 and m_alloc_size
+	to errno */
+os_file_size_t
+os_file_get_size(
+	const char*	filename)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Gets the size of an open file.
+@param[in]	file		handle to a file
+@return file size, or (os_offset_t) -1 on failure */
+os_offset_t
+os_file_get_size(
+	os_file_t	file)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Extend a file.
+
+On Windows, extending a file allocates blocks for the file,
+unless the file is sparse.
+
+On Unix, we will extend the file with ftruncate(), if
+file needs to be sparse. Otherwise posix_fallocate() is used
+when available, and if not, binary zeroes are added to the end
+of file.
+
+@param[in]	name	file name
+@param[in]	file	file handle
+@param[in]	size	desired file size
+@param[in]	is_sparse	whether to create a sparse file (no preallocating)
+@return	whether the operation succeeded */
+bool
+os_file_set_size(
+	const char*	name,
+	os_file_t	file,
+	os_offset_t	size,
+	bool		is_sparse = false)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Truncates a file at its current position.
+@param[in,out]	file	file to be truncated
+@return true on success */
+bool
+os_file_set_eof(
+	FILE*		file);	/*!< in: file to be truncated */
+
+/** Truncate a file to a specified size in bytes.
+@param[in]	pathname	file path
+@param[in]	file		file to be truncated
+@param[in]	size		size preserved in bytes
+@param[in]	allow_shrink	whether the file is allowed to become smaller
+@return true on success */
+bool
+os_file_truncate(
+	const char*	pathname,
+	os_file_t	file,
+	os_offset_t	size,
+	bool		allow_shrink = false);
+
+/** NOTE! Use the corresponding macro os_file_flush(), not directly this
+function!
+Flushes the write buffers of a given file to the disk.
+@param[in]	file		handle to a file
+@return true on success */
+bool
+os_file_flush_func(
+	os_file_t	file);
+
+/** Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + OS_FILE_ERROR_MAX is returned.
+@param[in]	report_all_errors	true if we want an error message
+					printed for all errors
+@param[in]	on_error_silent		if true, don't print any diagnostic
+					to the log
+@return error number, or OS error number + OS_FILE_ERROR_MAX */
+ulint os_file_get_last_error(bool report_all_errors,
+                             bool on_error_silent= false);
+
+/** NOTE! Use the corresponding macro os_file_read(), not directly this
+function!
+Requests a synchronous read operation.
+@param[in]	type		IO request context
+@param[in]	file		Open file handle
+@param[out]	buf		buffer where to read
+@param[in]	offset		file offset where to read
+@param[in]	n		number of bytes to read
+@param[out]	o		number of bytes actually read
+@return DB_SUCCESS if request was successful */
+dberr_t
+os_file_read_func(
+	const IORequest&	type,
+	os_file_t		file,
+	void*			buf,
+	os_offset_t		offset,
+	ulint			n,
+	ulint*			o)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Rewind file to its start, read at most size - 1 bytes from it to str, and
+NUL-terminate str. All errors are silently ignored. This function is
+mostly meant to be used with temporary files.
+@param[in,out]	file		file to read from
+@param[in,out]	str		buffer where to read
+@param[in]	size		size of buffer */
+void
+os_file_read_string(
+	FILE*		file,
+	char*		str,
+	ulint		size);
+
+/** NOTE! Use the corresponding macro os_file_write(), not directly this
+function! Requests a synchronous write operation.
+@param[in]	type		IO request context
+@param[in]	name		name of the file or path
+@param[in]	file		Open file handle
+@param[in]	buf		buffer from which to write
+@param[in]	offset		file offset where to write
+@param[in]	n		number of bytes to write
+@return DB_SUCCESS if request was successful */
+dberr_t
+os_file_write_func(
+	const IORequest&	type,
+	const char*		name,
+	os_file_t		file,
+	const void*		buf,
+	os_offset_t		offset,
+	ulint			n)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Check the existence and type of the given file.
+@param[in]	path		pathname of the file
+@param[out]	exists		true if file exists
+@param[out]	type		type of the file (if it exists)
+@return true if call succeeded */
+bool
+os_file_status(
+	const char*	path,
+	bool*		exists,
+	os_file_type_t* type);
+
+/** This function reduces a null-terminated full remote path name into
+the path that is sent by MySQL for DATA DIRECTORY clause.  It replaces
+the 'databasename/tablename.ibd' found at the end of the path with just
+'tablename'.
+
+Since the result is always smaller than the path sent in, no new memory
+is allocated. The caller should allocate memory for the path sent in.
+This function manipulates that path in place.
+
+If the path format is not as expected, just return.  The result is used
+to inform a SHOW CREATE TABLE command.
+@param[in,out]	data_dir_path		Full path/data_dir_path */
+void
+os_file_make_data_dir_path(
+	char*	data_dir_path);
+
+/** Create all missing subdirectories along the given path.
+@return DB_SUCCESS if OK, otherwise error code. */
+dberr_t
+os_file_create_subdirs_if_needed(
+	const char*	path);
+
+#ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
+/* Test the function os_file_get_parent_dir. */
+void
+unit_test_os_file_get_parent_dir();
+#endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
+
+/**
+Initializes the asynchronous io system. */
+int os_aio_init();
+
+/**
+Frees the asynchronous io system. */
+void os_aio_free();
+
+/** Submit a fake read request during crash recovery.
+@param type   fake read request
+@param offset additional context */
+void os_fake_read(const IORequest &type, os_offset_t offset);
+
+/** Request a read or write.
+@param type		I/O request
+@param buf		buffer
+@param offset		file offset
+@param n		number of bytes
+@retval DB_SUCCESS if request was queued successfully
+@retval DB_IO_ERROR on I/O error */
+dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n);
+
+/** @return number of pending reads */
+size_t os_aio_pending_reads();
+/** @return approximate number of pending reads */
+size_t os_aio_pending_reads_approx();
+/** @return number of pending writes */
+size_t os_aio_pending_writes();
+
+/** Wait until there are no pending asynchronous writes.
+@param declare  whether the wait will be declared in tpool */
+void os_aio_wait_until_no_pending_writes(bool declare);
+
+/** Wait until all pending asynchronous reads have completed.
+@param declare  whether the wait will be declared in tpool */
+void os_aio_wait_until_no_pending_reads(bool declare);
+
+/** Prints info of the aio arrays.
+@param[in,out]	file		file where to print */
+void
+os_aio_print(FILE* file);
+
+/** Refreshes the statistics used to print per-second averages. */
+void
+os_aio_refresh_stats();
+
+/** Checks that all slots in the system have been freed, that is, there are
+no pending io operations. */
+bool
+os_aio_all_slots_free();
+
+
+/** This function returns information about the specified file
+@param[in]	path		pathname of the file
+@param[in]	stat_info	information of a file in a directory
+@param[in]	check_rw_perm	for testing whether the file can be opened
+				in RW mode
+@param[in]	read_only	if true read only mode checks are enforced
+@return DB_SUCCESS if all OK */
+dberr_t
+os_file_get_status(
+	const char*	path,
+	os_file_stat_t* stat_info,
+	bool		check_rw_perm,
+	bool		read_only);
+
+/** Set the file create umask
+@param[in]	umask		The umask to use for file creation. */
+void
+os_file_set_umask(ulint umask);
+
+#ifdef _WIN32
+
+/**
+Make file sparse, on Windows.
+
+@param[in]	file  file handle
+@param[in]	is_sparse if true, make file sparse,
+			otherwise "unsparse" the file
+@return true on success, false on error */
+bool os_file_set_sparse_win32(os_file_t file, bool is_sparse = true);
+
+/**
+Changes file size on Windows
+
+If file is extended, the following happens: the bytes between
+old and new EOF are zeros.
+
+If file is sparse, "virtual" block is added at the end of
+allocated area.
+
+If file is normal, file system allocates storage.
+
+@param[in]	pathname	file path
+@param[in]	file		file handle
+@param[in]	size		size to preserve in bytes
+@return true if success */
+bool
+os_file_change_size_win32(
+	const char*	pathname,
+	os_file_t	file,
+	os_offset_t	size);
+
+#endif /*_WIN32 */
+
+/** Free storage space associated with a section of the file.
+@param[in]	fh		Open file handle
+@param[in]	off		Starting offset (SEEK_SET)
+@param[in]	len		Size of the hole
+@return DB_SUCCESS or error code */
+dberr_t
+os_file_punch_hole(
+	os_file_t	fh,
+	os_offset_t	off,
+	os_offset_t	len)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Determine whether a path is absolute.
+A path that begins with '/' is absolute. On Windows, so is a path that
+begins with a backslash, or whose second character is ':' and whose
+third character is '/' or a backslash.
+@param[in]	path	OS directory or file path to evaluate
+@retval true if an absolute path
+@retval false if a relative path */
+inline bool is_absolute_path(const char *path)
+{
+  const char first= path[0];
+
+  if (first == '/')
+    return true;
+
+#ifdef _WIN32
+  /* An empty string has no second character to inspect. */
+  if (first == '\0')
+    return false;
+
+  if (first == '\\')
+    return true;
+
+  /* Drive prefix such as C: followed by a separator. */
+  if (path[1] == ':' && (path[2] == '/' || path[2] == '\\'))
+    return true;
+#endif /* _WIN32 */
+
+  return false;
+}
+
+#include "os0file.inl"
+
+#endif /* os0file_h */
diff --git a/storage/innobase/include/os0file.inl b/storage/innobase/include/os0file.inl
new file mode 100644
index 00000000..7de31505
--- /dev/null
+++ b/storage/innobase/include/os0file.inl
@@ -0,0 +1,412 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0file.inl
+The interface to the operating system file io
+
+Created 2/20/2010 Jimmy Yang
+*******************************************************/
+
+#ifdef UNIV_PFS_IO
+/** Performance schema instrumented wrapper around
+os_file_create_simple_func(), which opens or creates a file.
+NOTE! Call this through the macro os_file_create_simple(), never
+directly.
+@param[in]	key		Performance Schema Key
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode
+@param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_simple_func(
+	mysql_pfs_key_t key,
+	const char*	name,
+	ulint		create_mode,
+	ulint		access_type,
+	bool		read_only,
+	bool*		success,
+	const char*	src_file,
+	uint		src_line)
+{
+	struct PSI_file_locker* psi_locker = NULL;
+	PSI_file_locker_state	psi_state;
+
+	/* OS_FILE_CREATE is instrumented as a creation, all else as an open */
+	register_pfs_file_open_begin(
+		&psi_state, psi_locker, key,
+		create_mode == OS_FILE_CREATE
+		? PSI_FILE_CREATE : PSI_FILE_OPEN,
+		name, src_file, src_line);
+
+	pfs_os_file_t	handle = os_file_create_simple_func(
+		name, create_mode, access_type, read_only, success);
+
+	/* Pass success itself if the call succeeded, and 0 otherwise */
+	register_pfs_file_open_end(psi_locker, handle,
+				   (*success == TRUE ? success : 0));
+
+	return handle;
+}
+
+/** Performance schema instrumented wrapper around
+os_file_create_simple_no_error_handling_func(); adds instrumentation
+to monitor file creation/open.
+NOTE! Call this through the macro
+os_file_create_simple_no_error_handling(), never directly.
+@param[in]	key		Performance Schema Key
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode
+@param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+				OS_FILE_READ_ALLOW_DELETE; the last option is
+				used by a backup program reading the file
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_simple_no_error_handling_func(
+	mysql_pfs_key_t key,
+	const char*	name,
+	ulint		create_mode,
+	ulint		access_type,
+	bool		read_only,
+	bool*		success,
+	const char*	src_file,
+	uint		src_line)
+{
+	struct PSI_file_locker* psi_locker = NULL;
+	PSI_file_locker_state	psi_state;
+
+	/* OS_FILE_CREATE is instrumented as a creation, all else as an open */
+	register_pfs_file_open_begin(
+		&psi_state, psi_locker, key,
+		create_mode == OS_FILE_CREATE
+		? PSI_FILE_CREATE : PSI_FILE_OPEN,
+		name, src_file, src_line);
+
+	pfs_os_file_t	handle = os_file_create_simple_no_error_handling_func(
+		name, create_mode, access_type, read_only, success);
+
+	register_pfs_file_open_end(psi_locker, handle,
+				   (*success == TRUE ? success : 0));
+
+	return handle;
+}
+
+/** Performance schema instrumented wrapper around os_file_create_func();
+adds instrumentation to monitor file creation/open.
+NOTE! Call this through the macro os_file_create(), never directly.
+@param[in]	key		Performance Schema Key
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode
+@param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
+				is desired, OS_FILE_NORMAL, if any normal file;
+				NOTE that it also depends on type, os_aio_..
+				and srv_.. variables whether we really use
+				async I/O or unbuffered I/O: look in the
+				function source code for the exact rules
+@param[in]	type		forwarded unchanged to os_file_create_func()
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_func(
+	mysql_pfs_key_t key,
+	const char*	name,
+	ulint		create_mode,
+	ulint		purpose,
+	ulint		type,
+	bool		read_only,
+	bool*		success,
+	const char*	src_file,
+	uint		src_line)
+{
+	struct PSI_file_locker* psi_locker = NULL;
+	PSI_file_locker_state	psi_state;
+
+	/* OS_FILE_CREATE is instrumented as a creation, all else as an open */
+	register_pfs_file_open_begin(
+		&psi_state, psi_locker, key,
+		create_mode == OS_FILE_CREATE
+		? PSI_FILE_CREATE : PSI_FILE_OPEN,
+		name, src_file, src_line);
+
+	pfs_os_file_t	handle = os_file_create_func(
+		name, create_mode, purpose, type, read_only, success);
+
+	register_pfs_file_open_end(psi_locker, handle,
+				   (*success == TRUE ? success : 0));
+
+	return handle;
+}
+/** Performance schema instrumented wrapper around os_file_close_func().
+NOTE! Call this through the macro os_file_close(), never directly.
+The close is reported to the instrumentation with a count of 0.
+@param[in]	file		handle to a file
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return the value returned by os_file_close_func()
+@retval true if success */
+UNIV_INLINE
+bool
+pfs_os_file_close_func(
+	pfs_os_file_t	file,
+	const char*	src_file,
+	uint		src_line)
+{
+	struct PSI_file_locker*	psi_locker = NULL;
+	PSI_file_locker_state	psi_state;
+
+	/* bracket the close between the begin and end instrumentation calls */
+	register_pfs_file_io_begin(&psi_state, psi_locker, file, 0,
+				   PSI_FILE_CLOSE, src_file, src_line);
+
+	const bool	closed = os_file_close_func(file);
+
+	register_pfs_file_io_end(psi_locker, 0);
+
+	return closed;
+}
+
+/** Performance schema instrumented wrapper around os_file_read_func(),
+which requests a synchronous read operation.
+NOTE! Call this through the macro os_file_read(), never directly
+invoke this function.
+@param[in]	type		IO request context
+@param[in]	file		Open file handle
+@param[out]	buf		buffer where to read
+@param[in]	offset		file offset where to read
+@param[in]	n		number of bytes to read
+@param[out]	o		number of bytes actually read
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return DB_SUCCESS if request was successful */
+UNIV_INLINE
+dberr_t
+pfs_os_file_read_func(
+	const IORequest&	type,
+	pfs_os_file_t		file,
+	void*			buf,
+	os_offset_t		offset,
+	ulint			n,
+	ulint*			o,
+	const char*		src_file,
+	uint			src_line)
+{
+	struct PSI_file_locker*	psi_locker = NULL;
+	PSI_file_locker_state	psi_state;
+
+	register_pfs_file_io_begin(&psi_state, psi_locker, file, n,
+				   PSI_FILE_READ, src_file, src_line);
+
+	/* The instrumentation is given the requested size n both before
+	and after the read; the count stored through o is not consulted. */
+	const dberr_t	err = os_file_read_func(type, file, buf, offset, n, o);
+
+	register_pfs_file_io_end(psi_locker, n);
+
+	return err;
+}
+
+/** Performance schema instrumented wrapper around os_file_write_func(),
+which requests a synchronous write operation.
+NOTE! Call this through the macro os_file_write(), never directly
+invoke this function.
+@param[in]	type		IO request context
+@param[in]	name		Name of the file or path as NUL terminated
+				string
+@param[in]	file		Open file handle
+@param[in]	buf		buffer from which to write
+@param[in]	offset		file offset where to write
+@param[in]	n		number of bytes to write
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return	error code
+@retval	DB_SUCCESS	if the request was successfully fulfilled */
+UNIV_INLINE
+dberr_t
+pfs_os_file_write_func(
+	const IORequest&	type,
+	const char*		name,
+	pfs_os_file_t		file,
+	const void*		buf,
+	os_offset_t		offset,
+	ulint			n,
+	const char*		src_file,
+	uint			src_line)
+{
+	struct PSI_file_locker*	psi_locker = NULL;
+	PSI_file_locker_state	psi_state;
+
+	register_pfs_file_io_begin(&psi_state, psi_locker, file, n,
+				   PSI_FILE_WRITE, src_file, src_line);
+
+	/* n is reported to the instrumentation whatever the outcome */
+	const dberr_t	err = os_file_write_func(
+		type, name, file, buf, offset, n);
+
+	register_pfs_file_io_end(psi_locker, n);
+
+	return err;
+}
+
+
+/** Performance schema instrumented wrapper around os_file_flush_func(),
+which flushes the write buffers of a given file to the disk.
+NOTE! Call this through the macro os_file_flush(), never directly
+invoke this function.
+The flush is reported to the instrumentation as a PSI_FILE_SYNC.
+@param[in]	file		Open file handle
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_flush_func(
+	pfs_os_file_t	file,
+	const char*	src_file,
+	uint		src_line)
+{
+	struct PSI_file_locker*	psi_locker = NULL;
+	PSI_file_locker_state	psi_state;
+
+	register_pfs_file_io_begin(&psi_state, psi_locker, file, 0,
+				   PSI_FILE_SYNC, src_file, src_line);
+
+	const bool	flushed = os_file_flush_func(file);
+
+	register_pfs_file_io_end(psi_locker, 0);
+
+	return flushed;
+}
+
+/** Performance schema instrumented wrapper around os_file_rename_func().
+NOTE! Call this through the macro os_file_rename(), never directly.
+The instrumentation is started with newpath and is finished with both
+paths, along with the negated result of os_file_rename_func().
+@param[in]	key		Performance Schema Key
+@param[in]	oldpath		old file path as a null-terminated string
+@param[in]	newpath		new file path
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_rename_func(
+	mysql_pfs_key_t	key,
+	const char*	oldpath,
+	const char*	newpath,
+	const char*	src_file,
+	uint		src_line)
+
+{
+	struct PSI_file_locker*	psi_locker = NULL;
+	PSI_file_locker_state	psi_state;
+
+	register_pfs_file_rename_begin(
+		&psi_state, psi_locker, key, PSI_FILE_RENAME, newpath,
+		src_file, src_line);
+
+	const bool	renamed = os_file_rename_func(oldpath, newpath);
+
+	register_pfs_file_rename_end(psi_locker, oldpath, newpath, !renamed);
+
+	return renamed;
+}
+
+/** Performance schema instrumented wrapper around os_file_delete_func().
+NOTE! Call this through the macro os_file_delete(), never directly
+invoke this function.
+The deletion is reported to the instrumentation as a PSI_FILE_DELETE.
+@param[in]	key		Performance Schema Key
+@param[in]	name		file path as a null-terminated string
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_func(
+	mysql_pfs_key_t	key,
+	const char*	name,
+	const char*	src_file,
+	uint		src_line)
+{
+	struct PSI_file_locker*	psi_locker = NULL;
+	PSI_file_locker_state	psi_state;
+
+	register_pfs_file_close_begin(&psi_state, psi_locker, key,
+		PSI_FILE_DELETE, name, src_file, src_line);
+
+	const bool	deleted = os_file_delete_func(name);
+
+	register_pfs_file_close_end(psi_locker, 0);
+
+	return deleted;
+}
+
+/** Performance schema instrumented wrapper around
+os_file_delete_if_exists_func().
+NOTE! Call this through the macro os_file_delete_if_exists(), never
+directly invoke this function.
+The deletion is reported to the instrumentation as a PSI_FILE_DELETE.
+@param[in]	key		Performance Schema Key
+@param[in]	name		file path as a null-terminated string
+@param[in]	exist		indicate if file pre-exist
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_if_exists_func(
+	mysql_pfs_key_t	key,
+	const char*	name,
+	bool*		exist,
+	const char*	src_file,
+	uint		src_line)
+{
+	struct PSI_file_locker*	psi_locker = NULL;
+	PSI_file_locker_state	psi_state;
+
+	register_pfs_file_close_begin(&psi_state, psi_locker, key,
+		PSI_FILE_DELETE, name, src_file, src_line);
+
+	const bool	deleted = os_file_delete_if_exists_func(name, exist);
+
+	register_pfs_file_close_end(psi_locker, 0);
+
+	return deleted;
+}
+#endif /* UNIV_PFS_IO */
diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h
new file mode 100644
index 00000000..28aa3056
--- /dev/null
+++ b/storage/innobase/include/page0cur.h
@@ -0,0 +1,303 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/page0cur.h
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef page0cur_h
+#define page0cur_h
+
+#include "page0page.h"
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return page */
+UNIV_INLINE
+page_t*
+page_cur_get_page(
+/*==============*/
+	page_cur_t*	cur);	/*!< in: page cursor */
+/*********************************************************//**
+Gets pointer to the buffer block where the cursor is positioned.
+@return buffer block */
+UNIV_INLINE
+buf_block_t*
+page_cur_get_block(
+/*===============*/
+	page_cur_t*	cur);	/*!< in: page cursor */
+/*********************************************************//**
+Gets the page_zip_des_t for the block where the cursor is positioned.
+@return pointer to the page_zip_des_t (cf. buf_block_get_page_zip()) */
+UNIV_INLINE
+page_zip_des_t*
+page_cur_get_page_zip(
+/*==================*/
+	page_cur_t*	cur);	/*!< in: page cursor */
+/* Gets the record where the cursor is positioned.
+@param cur page cursor
+@return record */
+UNIV_INLINE
+rec_t *page_cur_get_rec(const page_cur_t *cur);
+#else /* UNIV_DEBUG */
+# define page_cur_get_page(cur)		page_align((cur)->rec)
+# define page_cur_get_block(cur)	(cur)->block
+# define page_cur_get_page_zip(cur)	buf_block_get_page_zip((cur)->block)
+# define page_cur_get_rec(cur)		(cur)->rec
+#endif /* UNIV_DEBUG */
+# define is_page_cur_get_page_zip(cur)	is_buf_block_get_page_zip((cur)->block)
+/*********************************************************//**
+Sets the cursor object to point before the first user record
+on the page. */
+UNIV_INLINE
+void
+page_cur_set_before_first(
+/*======================*/
+	const buf_block_t*	block,	/*!< in: index page */
+	page_cur_t*		cur);	/*!< in: cursor */
+/*********************************************************//**
+Sets the cursor object to point after the last user record on
+the page. */
+UNIV_INLINE
+void
+page_cur_set_after_last(
+/*====================*/
+	const buf_block_t*	block,	/*!< in: index page */
+	page_cur_t*		cur);	/*!< in: cursor */
+/*********************************************************//**
+Returns TRUE if the cursor is before first user record on page.
+@return TRUE if at start */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(
+/*=====================*/
+	const page_cur_t*	cur);	/*!< in: cursor */
+/*********************************************************//**
+Returns TRUE if the cursor is after last user record.
+@return TRUE if at end */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(
+/*===================*/
+	const page_cur_t*	cur);	/*!< in: cursor */
+/**********************************************************//**
+Positions the cursor on the given record. */
+UNIV_INLINE
+void
+page_cur_position(
+/*==============*/
+	const rec_t*		rec,	/*!< in: record on a page */
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	page_cur_t*		cur);	/*!< out: page cursor */
+
+/***********************************************************//**
+Inserts a record next to page cursor. Returns pointer to inserted record if
+succeed, i.e., enough space available, NULL otherwise. The cursor stays at
+the same logical position, but the physical position may change if it is
+pointing to a compressed page that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record if succeed, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+	page_cur_t*	cursor,	/*!< in/out: a page cursor */
+	const dtuple_t*	tuple,	/*!< in: pointer to a data tuple */
+	rec_offs**	offsets,/*!< out: offsets on *rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Inserts a record next to page cursor on an uncompressed page.
+@return pointer to record
+@retval nullptr if not enough space was available */
+rec_t*
+page_cur_insert_rec_low(
+/*====================*/
+	const page_cur_t*cur,	/*!< in: page cursor */
+	const rec_t*	rec,	/*!< in: record to insert after cur */
+	rec_offs*	offsets,/*!< in/out: rec_get_offsets(rec, index) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/***********************************************************//**
+Inserts a record next to page cursor on a compressed and uncompressed
+page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to inserted record
+@retval nullptr on failure */
+rec_t*
+page_cur_insert_rec_zip(
+/*====================*/
+	page_cur_t*	cursor,	/*!< in/out: page cursor,
+				logical position unchanged  */
+	const rec_t*	rec,	/*!< in: pointer to a physical record */
+	rec_offs*	offsets,/*!< in/out: rec_get_offsets(rec, index) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Deletes a record at the page cursor. The cursor is moved to the
+next record after the deleted one. */
+void
+page_cur_delete_rec(
+/*================*/
+	page_cur_t*		cursor,	/*!< in/out: a page cursor */
+	const rec_offs*		offsets,/*!< in: rec_get_offsets(
+					cursor->rec, index) */
+	mtr_t*			mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+
+/** Apply an INSERT_HEAP_REDUNDANT or INSERT_REUSE_REDUNDANT record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=REDUNDANT page.
+@param block      B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param reuse      false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev       byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@param enc_hdr    encoded fixed-size header bits
+@param hdr_c      number of common record header bytes with prev
+@param data_c     number of common data bytes with prev
+@param data       literal header and data bytes
+@param data_len   length of the literal data, in bytes
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_insert_redundant(const buf_block_t &block, bool reuse,
+                                 ulint prev, ulint enc_hdr,
+                                 size_t hdr_c, size_t data_c,
+                                 const void *data, size_t data_len);
+
+/** Apply an INSERT_HEAP_DYNAMIC or INSERT_REUSE_DYNAMIC record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=COMPACT or DYNAMIC page.
+@param block      B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param reuse      false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev       byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param shift      unless !reuse: number of bytes the PAGE_FREE is moving
+@param enc_hdr_l  number of copied record header bytes, plus record type bits
+@param hdr_c      number of common record header bytes with prev
+@param data_c     number of common data bytes with prev
+@param data       literal header and data bytes
+@param data_len   length of the literal data, in bytes
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_insert_dynamic(const buf_block_t &block, bool reuse,
+                               ulint prev, ulint shift, ulint enc_hdr_l,
+                               size_t hdr_c, size_t data_c,
+                               const void *data, size_t data_len);
+
+/** Apply a DELETE_ROW_FORMAT_REDUNDANT record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=REDUNDANT page.
+@param block    B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param prev     byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_delete_redundant(const buf_block_t &block, ulint prev);
+
+/** Apply a DELETE_ROW_FORMAT_DYNAMIC record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=COMPACT or DYNAMIC page.
+@param block      B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param prev       byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param hdr_size   record header size, excluding REC_N_NEW_EXTRA_BYTES
+@param data_size  data payload size, in bytes
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_delete_dynamic(const buf_block_t &block, ulint prev,
+                               size_t hdr_size, size_t data_size);
+
+MY_ATTRIBUTE((warn_unused_result))
+/****************************************************************//**
+Searches the right position for a page cursor. */
+bool
+page_cur_search_with_match(
+/*=======================*/
+	const dtuple_t*		tuple,	/*!< in: data tuple */
+	page_cur_mode_t		mode,	/*!< in: PAGE_CUR_L,
+					PAGE_CUR_LE, PAGE_CUR_G, or
+					PAGE_CUR_GE */
+	ulint*			iup_matched_fields,
+					/*!< in/out: already matched
+					fields in upper limit record */
+	ulint*			ilow_matched_fields,
+					/*!< in/out: already matched
+					fields in lower limit record */
+	page_cur_t*		cursor,	/*!< in/out: page cursor */
+	rtr_info_t*		rtr_info);/*!< in/out: rtree search stack */
+#ifdef BTR_CUR_HASH_ADAPT
+MY_ATTRIBUTE((warn_unused_result))
+/** Search the right position for a page cursor.
+@param[in]	tuple			key to be searched for
+@param[in]	mode			search mode
+@param[in,out]	iup_matched_fields	already matched fields in the
+upper limit record
+@param[in,out]	iup_matched_bytes	already matched bytes in the
+first partially matched field in the upper limit record
+@param[in,out]	ilow_matched_fields	already matched fields in the
+lower limit record
+@param[in,out]	ilow_matched_bytes	already matched bytes in the
+first partially matched field in the lower limit record
+@param[in,out]	cursor			page cursor */
+bool
+page_cur_search_with_match_bytes(
+	const dtuple_t*		tuple,
+	page_cur_mode_t		mode,
+	ulint*			iup_matched_fields,
+	ulint*			iup_matched_bytes,
+	ulint*			ilow_matched_fields,
+	ulint*			ilow_matched_bytes,
+	page_cur_t*		cursor);
+#endif /* BTR_CUR_HASH_ADAPT */
+/***********************************************************//**
+Positions a page cursor on a randomly chosen user record on a page. If there
+are no user records, sets the cursor on the infimum record. */
+void page_cur_open_on_rnd_user_rec(page_cur_t *cursor);
+
+/** Index page cursor: refers to a record (rec) on the page held in a
+buffer block (block) */
+struct page_cur_t{
+	dict_index_t*	index;	/*!< presumably the index of the page; confirm */
+	rec_t*		rec;	/*!< pointer to a record on page */
+	rec_offs*	offsets;	/*!< presumably offsets for rec; confirm */
+	buf_block_t*	block;	/*!< pointer to the block containing rec */
+};
+
+/** Set cur->rec to page_rec_get_next(cur->rec). @return the new cur->rec */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline rec_t *page_cur_move_to_next(page_cur_t *cur)
+{
+  return cur->rec= page_rec_get_next(cur->rec);
+}
+/** Set cur->rec to page_rec_get_prev(cur->rec). @return the new cur->rec */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline rec_t *page_cur_move_to_prev(page_cur_t *cur)
+{
+  return cur->rec= page_rec_get_prev(cur->rec);
+}
+
+#include "page0cur.inl"
+
+#endif
diff --git a/storage/innobase/include/page0cur.inl b/storage/innobase/include/page0cur.inl
new file mode 100644
index 00000000..7c4eafa2
--- /dev/null
+++ b/storage/innobase/include/page0cur.inl
@@ -0,0 +1,203 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/page0cur.inl
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifdef UNIV_DEBUG
+/** Get the page frame on which a page cursor is positioned.
+@param[in]	cur	page cursor
+@return start of the page frame that contains the cursor record */
+UNIV_INLINE
+page_t*
+page_cur_get_page(page_cur_t* cur)
+{
+  /* page_cur_get_rec() carries the debug checks on the cursor. */
+  rec_t *const rec= page_cur_get_rec(cur);
+  return page_align(rec);
+}
+
+/** Get the buffer block on which a page cursor is positioned.
+@param[in]	cur	page cursor
+@return the buffer block of the cursor (cur->block) */
+UNIV_INLINE
+buf_block_t*
+page_cur_get_block(page_cur_t* cur)
+{
+  ut_ad(cur);
+  /* A positioned cursor must point into the frame of its own block. */
+  ut_ad(!cur->rec || page_align(cur->rec) == cur->block->page.frame);
+  buf_block_t *const owner= cur->block;
+  return owner;
+}
+
+/** Get the compressed page descriptor of the block of a page cursor.
+@param[in]	cur	page cursor
+@return result of buf_block_get_page_zip() on the cursor block */
+UNIV_INLINE
+page_zip_des_t*
+page_cur_get_page_zip(page_cur_t* cur)
+{
+	/* page_cur_get_block() asserts that the cursor is consistent. */
+	buf_block_t* const block = page_cur_get_block(cur);
+	return buf_block_get_page_zip(block);
+}
+
+/** Gets the record where the cursor is positioned.
+@param cur page cursor; if cur->rec is set, it must lie in cur->block
+@return cur->rec */
+UNIV_INLINE
+rec_t *page_cur_get_rec(const page_cur_t *cur)
+{
+  ut_ad(cur);
+  ut_ad(!cur->rec || page_align(cur->rec) == cur->block->page.frame);
+  return cur->rec;
+}
+#endif /* UNIV_DEBUG */
+
+/** Position a page cursor on the infimum pseudo-record of a page,
+that is, before the first user record.
+@param[in]	block	index page
+@param[out]	cur	page cursor; block and rec are assigned */
+UNIV_INLINE
+void
+page_cur_set_before_first(const buf_block_t* block, page_cur_t* cur)
+{
+	buf_block_t* const target = const_cast<buf_block_t*>(block);
+	page_t* const frame = buf_block_get_frame(target);
+	cur->block = target;
+	cur->rec = page_get_infimum_rec(frame);
+}
+
+/** Position a page cursor on the supremum pseudo-record of a page,
+that is, after the last user record.
+@param[in]	block	index page
+@param[out]	cur	page cursor; block and rec are assigned */
+UNIV_INLINE
+void
+page_cur_set_after_last(const buf_block_t* block, page_cur_t* cur)
+{
+	buf_block_t* const target = const_cast<buf_block_t*>(block);
+	page_t* const frame = buf_block_get_frame(target);
+	cur->block = target;
+	cur->rec = page_get_supremum_rec(frame);
+}
+
+/** Check whether a page cursor is positioned before the first user
+record, that is, on the page infimum pseudo-record.
+@param[in]	cur	page cursor
+@return nonzero if the cursor is on the infimum record */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(const page_cur_t* cur)
+{
+	ut_ad(cur);
+	ut_ad(page_align(cur->rec) == cur->block->page.frame);
+	const rec_t* const rec = cur->rec;
+	return page_rec_is_infimum(rec);
+}
+
+/** Check whether a page cursor is positioned after the last user
+record, that is, on the page supremum pseudo-record.
+@param[in]	cur	page cursor
+@return nonzero if the cursor is on the supremum record */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(const page_cur_t* cur)
+{
+	ut_ad(cur);
+	ut_ad(page_align(cur->rec) == cur->block->page.frame);
+	const rec_t* const rec = cur->rec;
+	return page_rec_is_supremum(rec);
+}
+
+/** Position a page cursor on a given record.
+@param[in]	rec	record on a page
+@param[in]	block	buffer block containing the record
+@param[out]	cur	page cursor; rec and block are assigned */
+UNIV_INLINE
+void
+page_cur_position(
+	const rec_t*		rec,
+	const buf_block_t*	block,
+	page_cur_t*		cur)
+{
+	ut_ad(rec && block && cur);
+	/* The record must reside in the page frame of the block. */
+	ut_ad(page_align(rec) == block->page.frame);
+	cur->block = const_cast<buf_block_t*>(block);
+	cur->rec = const_cast<rec_t*>(rec);
+}
+
+/***********************************************************//**
+Inserts a record next to page cursor. Returns pointer to inserted record if
+succeed, i.e., enough space available, NULL otherwise. The cursor stays at
+the same logical position, but the physical position may change if it is
+pointing to a compressed page that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record if succeed, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+	page_cur_t*	cursor,	/*!< in/out: a page cursor */
+	const dtuple_t*	tuple,	/*!< in: pointer to a data tuple */
+	rec_offs**	offsets,/*!< in/out: offsets on *rec */
+	mem_heap_t**	heap,	/*!< in/out: memory heap; *heap may be NULL */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	ulint size = rec_get_converted_size(cursor->index, tuple, n_ext);
+
+	if (!*heap) {
+		*heap = mem_heap_create(size
+					+ (4 + REC_OFFS_HEADER_SIZE
+					   + dtuple_get_n_fields(tuple))
+					* sizeof **offsets);
+	}
+	/* Build the physical record image of the tuple in the heap. */
+	rec_t* rec = rec_convert_dtuple_to_rec(
+		static_cast<byte*>(mem_heap_alloc(*heap, size)),
+		cursor->index, tuple, n_ext);
+
+	*offsets = rec_get_offsets(rec, cursor->index, *offsets,
+				   page_is_leaf(cursor->block->page.frame)
+				   ? cursor->index->n_core_fields : 0,
+				   ULINT_UNDEFINED, heap);
+	ut_ad(size == rec_offs_size(*offsets));
+	/* Dispatch on is_buf_block_get_page_zip() to the matching insert. */
+	if (is_buf_block_get_page_zip(cursor->block)) {
+		rec = page_cur_insert_rec_zip(cursor, rec, *offsets, mtr);
+	} else {
+		rec = page_cur_insert_rec_low(cursor, rec, *offsets, mtr);
+	}
+
+	ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, cursor->index, *offsets));
+	return(rec);
+}
+
diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h
new file mode 100644
index 00000000..2978656b
--- /dev/null
+++ b/storage/innobase/include/page0page.h
@@ -0,0 +1,1101 @@
+/*****************************************************************************
+Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0page.h
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0page_h
+#define page0page_h
+
+#include "page0types.h"
+#include "fsp0fsp.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+#include "rem0rec.h"
+#include "mach0data.h"
+#ifndef UNIV_INNOCHECKSUM
+#include "dict0dict.h"
+#include "data0data.h"
+#include "mtr0mtr.h"
+
+/*			PAGE HEADER
+			===========
+
+Index page header starts at the first offset left free by the FIL-module */
+
+typedef	byte		page_header_t;
+#endif /* !UNIV_INNOCHECKSUM */
+
+#define	PAGE_HEADER	FSEG_PAGE_DATA	/* index page header starts at this
+				offset */
+/*-----------------------------*/
+#define PAGE_N_DIR_SLOTS 0	/* number of slots in page directory */
+#define	PAGE_HEAP_TOP	 2	/* pointer to record heap top */
+#define	PAGE_N_HEAP	 4	/* number of records in the heap,
+				bit 15=flag: new-style compact page format */
+#define	PAGE_FREE	 6	/* pointer to start of page free record list */
+#define	PAGE_GARBAGE	 8	/* number of bytes in deleted records */
+#define	PAGE_LAST_INSERT 10	/* pointer to the last inserted record, or
+				0 if this info has been reset by a delete,
+				for example */
+
+/** This 10-bit field is usually 0. In B-tree index pages of
+ROW_FORMAT=REDUNDANT tables, this byte can contain garbage if the .ibd
+file was created in MySQL 4.1.0 or if the table resides in the system
+tablespace and was created before MySQL 4.1.1 or MySQL 4.0.14.
+In this case, the FIL_PAGE_TYPE would be FIL_PAGE_INDEX.
+
+In ROW_FORMAT=COMPRESSED tables, this field is always 0, because
+instant ADD COLUMN is not supported.
+
+In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables, this field is
+always 0, except in the root page of the clustered index after instant
+ADD COLUMN.
+
+Instant ADD COLUMN will change FIL_PAGE_TYPE to FIL_PAGE_TYPE_INSTANT
+and initialize the PAGE_INSTANT field to the original number of
+fields in the clustered index (dict_index_t::n_core_fields).  The most
+significant bits are in the first byte, and the least significant 5
+bits are stored in the most significant 5 bits of PAGE_DIRECTION_B.
+
+These FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be assigned even if
+instant ADD COLUMN was not committed. Changes to these page header fields
+are not undo-logged, but changes to the hidden metadata record are.
+If the server is killed and restarted, the page header fields could
+remain set even though no metadata record is present.
+
+When the table becomes empty, the PAGE_INSTANT field and the
+FIL_PAGE_TYPE can be reset and any metadata record be removed. */
+#define PAGE_INSTANT	12
+
+/** last insert direction: PAGE_LEFT, ....
+In ROW_FORMAT=REDUNDANT tables created before MySQL 4.1.1 or MySQL 4.0.14,
+this byte can be garbage. */
+#define	PAGE_DIRECTION_B 13
+#define	PAGE_N_DIRECTION 14	/* number of consecutive inserts to the same
+				direction */
+#define	PAGE_N_RECS	 16	/* number of user records on the page */
+/** The largest DB_TRX_ID that may have modified a record on the page;
+Defined only in secondary index leaf pages and in change buffer leaf pages.
+Otherwise written as 0. @see PAGE_ROOT_AUTO_INC */
+#define PAGE_MAX_TRX_ID	 18
+/** The AUTO_INCREMENT value (on persistent clustered index root pages). */
+#define PAGE_ROOT_AUTO_INC	PAGE_MAX_TRX_ID
+#define PAGE_HEADER_PRIV_END 26	/* end of private data structure of the page
+				header which are set in a page create */
+/*----*/
+#define	PAGE_LEVEL	 26	/* level of the node in an index tree; the
+				leaf level is the level 0.  This field should
+				not be written to after page creation. */
+#define	PAGE_INDEX_ID	 28	/* index id where the page belongs.
+				This field should not be written to after
+				page creation. */
+
+#define PAGE_BTR_SEG_LEAF 36	/* file segment header for the leaf pages in
+				a B-tree: defined only on the root page of a
+				B-tree, but not in the root of an ibuf tree */
+#define PAGE_BTR_IBUF_FREE_LIST	PAGE_BTR_SEG_LEAF
+#define PAGE_BTR_IBUF_FREE_LIST_NODE PAGE_BTR_SEG_LEAF
+				/* in the place of PAGE_BTR_SEG_LEAF and _TOP
+				there is a free list base node if the page is
+				the root page of an ibuf tree, and at the same
+				place is the free list node if the page is in
+				a free list */
+#define PAGE_BTR_SEG_TOP (36 + FSEG_HEADER_SIZE)
+				/* file segment header for the non-leaf pages
+				in a B-tree: defined only on the root page of
+				a B-tree, but not in the root of an ibuf
+				tree */
+/*----*/
+#define PAGE_DATA	(PAGE_HEADER + 36 + 2 * FSEG_HEADER_SIZE)
+				/* start of data on the page */
+
+#define PAGE_OLD_INFIMUM	(PAGE_DATA + 1 + REC_N_OLD_EXTRA_BYTES)
+				/* offset of the page infimum record on an
+				old-style page */
+#define PAGE_OLD_SUPREMUM	(PAGE_DATA + 2 + 2 * REC_N_OLD_EXTRA_BYTES + 8)
+				/* offset of the page supremum record on an
+				old-style page */
+#define PAGE_OLD_SUPREMUM_END (PAGE_OLD_SUPREMUM + 9)
+				/* offset of the page supremum record end on
+				an old-style page */
+#define PAGE_NEW_INFIMUM	(PAGE_DATA + REC_N_NEW_EXTRA_BYTES)
+				/* offset of the page infimum record on a
+				new-style compact page */
+#define PAGE_NEW_SUPREMUM	(PAGE_DATA + 2 * REC_N_NEW_EXTRA_BYTES + 8)
+				/* offset of the page supremum record on a
+				new-style compact page */
+#define PAGE_NEW_SUPREMUM_END (PAGE_NEW_SUPREMUM + 8)
+				/* offset of the page supremum record end on
+				a new-style compact page */
+/*-----------------------------*/
+
+/* Heap numbers */
+#define PAGE_HEAP_NO_INFIMUM	0U	/* page infimum */
+#define PAGE_HEAP_NO_SUPREMUM	1U	/* page supremum */
+#define PAGE_HEAP_NO_USER_LOW	2U	/* first user record in
+					creation (insertion) order,
+					not necessarily collation order;
+					this record may have been deleted */
+
+/* Directions of cursor movement (stored in PAGE_DIRECTION field) */
+constexpr uint16_t PAGE_LEFT= 1;
+constexpr uint16_t PAGE_RIGHT= 2;
+constexpr uint16_t PAGE_SAME_REC= 3;
+constexpr uint16_t PAGE_SAME_PAGE= 4;
+constexpr uint16_t PAGE_NO_DIRECTION= 5;
+
+#ifndef UNIV_INNOCHECKSUM
+
+/*			PAGE DIRECTORY
+			==============
+*/
+
+typedef	byte			page_dir_slot_t;
+
+/* Offset of the directory start down from the page end. We call the
+slot with the highest file address directory start, as it points to
+the first record in the list of records. */
+#define	PAGE_DIR		FIL_PAGE_DATA_END
+
+/* We define a slot in the page directory as two bytes */
+constexpr uint16_t PAGE_DIR_SLOT_SIZE= 2;
+
+/* The offset of the physically lower end of the directory, counted from
+page end, when the page is empty */
+#define PAGE_EMPTY_DIR_START	(PAGE_DIR + 2 * PAGE_DIR_SLOT_SIZE)
+
+/* The maximum and minimum number of records owned by a directory slot. The
+number may drop below the minimum in the first and the last slot in the
+directory. */
+#define PAGE_DIR_SLOT_MAX_N_OWNED	8
+#define	PAGE_DIR_SLOT_MIN_N_OWNED	4
+
+extern my_bool srv_immediate_scrub_data_uncompressed;
+#endif /* UNIV_INNOCHECKSUM */
+
+/** Round a pointer down to the start of its page frame.
+@param[in]	ptr	pointer within a page frame
+@return ptr aligned down to a multiple of srv_page_size */
+MY_ATTRIBUTE((const))
+inline page_t* page_align(void *ptr)
+{
+  page_t *frame= reinterpret_cast<page_t*>(ut_align_down(ptr, srv_page_size));
+  return my_assume_aligned<UNIV_PAGE_SIZE_MIN>(frame);
+}
+inline const page_t *page_align(const void *ptr)
+{ /* const overload: forwards to page_align(void*) */
+  return page_align(const_cast<void*>(ptr));
+}
+
+/** Compute the byte offset of a pointer within its page frame.
+@param[in]	ptr	pointer within a page frame
+@return ut_align_offset(ptr, srv_page_size), narrowed to uint16_t */
+MY_ATTRIBUTE((const))
+inline uint16_t page_offset(const void*	ptr)
+{
+  return uint16_t(ut_align_offset(ptr, srv_page_size));
+}
+
+/** Test the new-style compact format flag of an index page.
+@param[in]	page	index page frame
+@return	nonzero (0x80) if ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED
+@retval	0	if ROW_FORMAT=REDUNDANT */
+inline byte
+page_is_comp(const page_t* page)
+{
+	ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+	const page_t* const n_heap = page + (PAGE_HEADER + PAGE_N_HEAP);
+	return *n_heap & 0x80;
+}
+
+/** Check whether an index page holds no user records.
+@param[in]	page	index page frame
+@return whether the PAGE_N_RECS header field is 0 */
+inline bool
+page_is_empty(const page_t* page)
+{
+	ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+	/* A zero test needs no byte order conversion. */
+	const page_t* const field = page + (PAGE_HEADER + PAGE_N_RECS);
+	return *reinterpret_cast<const uint16_t*>(field) == 0;
+}
+
+/** Check whether an index page accounts for any deleted record bytes.
+@param[in]	page	index page frame
+@return whether the PAGE_GARBAGE header field is not 0 */
+inline bool
+page_has_garbage(const page_t* page)
+{
+	ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+	/* A zero test needs no byte order conversion. */
+	const page_t* const field = page + (PAGE_HEADER + PAGE_GARBAGE);
+	return *reinterpret_cast<const uint16_t*>(field) != 0;
+}
+
+/** Determine whether a B-tree or R-tree index page is a leaf page.
+@param[in]	page	index page frame
+@return true if the page is a leaf (PAGE_LEVEL = 0) */
+inline bool
+page_is_leaf(const page_t* page)
+{
+	ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+	/* A zero test needs no byte order conversion. */
+	const page_t* const field = page + (PAGE_HEADER + PAGE_LEVEL);
+	return *reinterpret_cast<const uint16_t*>(field) == 0;
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/** Test the compact format flag of the page that contains a record.
+@param[in]	rec	record in an index page frame (not a copy)
+@return	nonzero	if ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED
+@retval	0	if ROW_FORMAT=REDUNDANT */
+inline byte
+page_rec_is_comp(const byte* rec)
+{
+	const page_t* const page = page_align(rec);
+	return page_is_comp(page);
+}
+
+# ifdef UNIV_DEBUG
+/** Check whether a record carries REC_INFO_MIN_REC_FLAG, which marks
+the metadata pseudo-record in the clustered index.
+@param[in]	rec	leaf page record on an index page
+@return	whether the record is the metadata pseudo-record */
+inline bool page_rec_is_metadata(const rec_t* rec)
+{
+	const auto info_bits = rec_get_info_bits(rec, page_rec_is_comp(rec));
+	return (info_bits & REC_INFO_MIN_REC_FLAG) != 0;
+}
+# endif /* UNIV_DEBUG */
+
+/** Get the offset of the infimum pseudo-record for the page format.
+@param[in]	page	index page frame
+@return PAGE_NEW_INFIMUM on a compact page, else PAGE_OLD_INFIMUM */
+inline unsigned
+page_get_infimum_offset(const page_t* page)
+{
+	ut_ad(!page_offset(page));
+	const bool comp = page_is_comp(page) != 0;
+	return comp ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM;
+}
+
+/** Get the offset of the supremum pseudo-record for the page format.
+@param[in]	page	index page frame
+@return PAGE_NEW_SUPREMUM on a compact page, else PAGE_OLD_SUPREMUM */
+inline unsigned
+page_get_supremum_offset(const page_t* page)
+{
+	ut_ad(!page_offset(page));
+	const bool comp = page_is_comp(page) != 0;
+	return comp ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM;
+}
+
+/** Determine whether a record offset denotes a user record.
+@param[in]	offset	record offset in the page
+@retval true if a user record
+@retval	false if the infimum or supremum pseudo-record */
+inline bool
+page_rec_is_user_rec_low(ulint offset)
+{
+	compile_time_assert(PAGE_OLD_INFIMUM >= PAGE_NEW_INFIMUM);
+	compile_time_assert(PAGE_OLD_SUPREMUM >= PAGE_NEW_SUPREMUM);
+	compile_time_assert(PAGE_NEW_INFIMUM < PAGE_OLD_SUPREMUM);
+	compile_time_assert(PAGE_OLD_INFIMUM < PAGE_NEW_SUPREMUM);
+	compile_time_assert(PAGE_NEW_SUPREMUM < PAGE_OLD_SUPREMUM_END);
+	compile_time_assert(PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM_END);
+	ut_ad(offset >= PAGE_NEW_INFIMUM);
+	ut_ad(offset <= srv_page_size - PAGE_EMPTY_DIR_START);
+
+	const bool pseudo_rec = offset == PAGE_NEW_SUPREMUM
+		|| offset == PAGE_NEW_INFIMUM
+		|| offset == PAGE_OLD_INFIMUM
+		|| offset == PAGE_OLD_SUPREMUM;
+	return !pseudo_rec;
+}
+
+/** Determine whether a record offset is that of a supremum pseudo-record.
+@param[in]	offset	record offset in an index page
+@return true if the supremum record */
+inline bool
+page_rec_is_supremum_low(ulint offset)
+{
+	ut_ad(offset >= PAGE_NEW_INFIMUM);
+	ut_ad(offset <= srv_page_size - PAGE_EMPTY_DIR_START);
+	/* The page format is not known here; accept either offset. */
+	return offset == PAGE_OLD_SUPREMUM || offset == PAGE_NEW_SUPREMUM;
+}
+
+/** Determine whether a record offset is that of an infimum pseudo-record.
+@param[in]	offset	record offset in an index page
+@return true if the infimum record */
+inline bool
+page_rec_is_infimum_low(ulint offset)
+{
+	ut_ad(offset >= PAGE_NEW_INFIMUM);
+	ut_ad(offset <= srv_page_size - PAGE_EMPTY_DIR_START);
+	/* The page format is not known here; accept either offset. */
+	return offset == PAGE_OLD_INFIMUM || offset == PAGE_NEW_INFIMUM;
+}
+
+/** Determine whether a B-tree or R-tree index record is in a leaf page.
+@param[in]	rec	index record in an index page (not a copy)
+@return true if the page that contains rec is a leaf page */
+inline
+bool
+page_rec_is_leaf(const page_t* rec)
+{
+	const page_t* page = page_align(rec);
+	ut_ad(ulint(rec - page) >= page_get_infimum_offset(page));
+	bool leaf = page_is_leaf(page);
+	ut_ad(!page_rec_is_comp(rec)
+	      || !page_rec_is_user_rec_low(ulint(rec - page))
+	      || leaf == !rec_get_node_ptr_flag(rec));
+	return leaf;
+}
+
+/** Determine whether an index page record is a user record.
+@param[in]	rec	record in an index page
+@return true if a user record */
+inline
+bool
+page_rec_is_user_rec(const rec_t* rec);
+
+/** Determine whether an index page record is the supremum record.
+@param[in]	rec	record in an index page
+@return true if the supremum record */
+inline
+bool
+page_rec_is_supremum(const rec_t* rec);
+
+/** Determine whether an index page record is the infimum record.
+@param[in]	rec	record in an index page
+@return true if the infimum record */
+inline
+bool
+page_rec_is_infimum(const rec_t* rec);
+
+/** Read the 8-byte PAGE_MAX_TRX_ID header field of an index page.
+@param[in]      page    index page
+@return the value of PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline trx_id_t page_get_max_trx_id(const page_t *page)
+{
+  static_assert((PAGE_HEADER + PAGE_MAX_TRX_ID) % 8 == 0, "alignment");
+  ut_ad(fil_page_index_page_check(page));
+  const byte *field= page + PAGE_HEADER + PAGE_MAX_TRX_ID;
+  return mach_read_from_8(my_assume_aligned<8>(field));
+}
+
+/**
+Set the number of owned records.
+@tparam compressed    whether to update any ROW_FORMAT=COMPRESSED page as well
+@param[in,out]  block   index page
+@param[in,out]  rec     record in block->page.frame
+@param[in]      n_owned number of records skipped in the sparse page directory
+@param[in]      comp    whether ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED
+@param[in,out]  mtr     mini-transaction */
+template<bool compressed>
+inline void page_rec_set_n_owned(buf_block_t *block, rec_t *rec, ulint n_owned,
+                                 bool comp, mtr_t *mtr)
+{
+  ut_ad(block->page.frame == page_align(rec));
+  ut_ad(comp == (page_is_comp(block->page.frame) != 0));
+
+  if (page_zip_des_t *page_zip= compressed
+      ? buf_block_get_page_zip(block) : nullptr)
+  {
+    ut_ad(comp);
+    rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED,
+                        REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+    if (rec_get_status(rec) != REC_STATUS_SUPREMUM)
+      page_zip_rec_set_owned(block, rec, n_owned, mtr);
+  }
+  else /* compressed is false, or the block has no page_zip */
+  {
+    rec-= comp ? REC_NEW_N_OWNED : REC_OLD_N_OWNED; /* byte holding n_owned */
+    mtr->write<1,mtr_t::MAYBE_NOP>(*block, rec, (*rec & ~REC_N_OWNED_MASK) |
+                                   (n_owned << REC_N_OWNED_SHIFT));
+  }
+}
+
+/*************************************************************//**
+Sets the max trx id field value. */
+void
+page_set_max_trx_id(
+/*================*/
+	buf_block_t*	block,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction, or NULL */
+/*************************************************************//**
+Sets the max trx id field value if trx_id is bigger than the previous
+value. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+	buf_block_t*	block,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
+
+/** Persist the AUTO_INCREMENT value on a clustered index root page.
+@param[in,out]	block	clustered index root page
+@param[in]	autoinc	next available AUTO_INCREMENT value
+@param[in,out]	mtr	mini-transaction
+@param[in]	reset	whether to reset the AUTO_INCREMENT
+			to a possibly smaller value than currently
+			exists in the page */
+void
+page_set_autoinc(
+	buf_block_t*		block,
+	ib_uint64_t		autoinc,
+	mtr_t*			mtr,
+	bool			reset)
+	MY_ATTRIBUTE((nonnull));
+
+/*************************************************************//**
+Returns the RTREE SPLIT SEQUENCE NUMBER (FIL_RTREE_SPLIT_SEQ_NUM).
+@return SPLIT SEQUENCE NUMBER */
+UNIV_INLINE
+node_seq_t
+page_get_ssn_id(
+/*============*/
+	const page_t*	page);	/*!< in: page */
+/*************************************************************//**
+Sets the RTREE SPLIT SEQUENCE NUMBER field value */
+UNIV_INLINE
+void
+page_set_ssn_id(
+/*============*/
+	buf_block_t*	block,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	node_seq_t	ssn_id,	/*!< in: split sequence id */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
+
+#endif /* !UNIV_INNOCHECKSUM */
+/** Read a 2-byte page header field at an even offset up to PAGE_INDEX_ID. */
+inline uint16_t page_header_get_field(const page_t *page, ulint field)
+{
+  ut_ad(field <= PAGE_INDEX_ID);
+  ut_ad(!(field & 1));
+  return mach_read_from_2(my_assume_aligned<2>(PAGE_HEADER + field + page));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Returns the offset stored in the given header field.
+@return offset from the start of the page, or 0 */
+UNIV_INLINE
+uint16_t
+page_header_get_offs(
+/*=================*/
+	const page_t*	page,	/*!< in: page */
+	ulint		field)	/*!< in: PAGE_FREE, ... */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Returns the pointer stored in the given header field, or NULL.
+NOTE: this is a macro; page and field may be evaluated twice. */
+#define page_header_get_ptr(page, field)			\
+	(page_header_get_offs((page), (field))			\
+	 ? (page) + page_header_get_offs((page), (field)) : NULL)
+
+/**
+Reset PAGE_LAST_INSERT.
+@param[in,out]  block    file page
+@param[in,out]  mtr      mini-transaction */
+inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
+#define page_get_infimum_rec(page) ((page) + page_get_infimum_offset(page))
+#define page_get_supremum_rec(page) ((page) + page_get_supremum_offset(page))
+
+/************************************************************//**
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record
+@retval nullptr on corrupted page */
+const rec_t*
+page_rec_get_nth_const(
+/*===================*/
+	const page_t*	page,	/*!< in: page */
+	ulint		nth)	/*!< in: nth record */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Return the nth record of the record list (non-const variant).
+This is the inverse function of page_rec_get_n_recs_before().
+@param[in]	page	index page
+@param[in]	nth	position in the record list
+@return nth record, or nullptr on corrupted page */
+inline rec_t *page_rec_get_nth(page_t* page, ulint nth)
+{
+  return const_cast<rec_t*>(page_rec_get_nth_const(page, nth));
+}
+
+/************************************************************//**
+Returns the middle record of the records on the page. If there is an
+even number of records in the list, returns the first record of the
+upper half-list.
+@return middle record */
+UNIV_INLINE
+rec_t*
+page_get_middle_rec(
+/*================*/
+	page_t*	page)	/*!< in: page */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Gets the page number.
+@return page number */
+UNIV_INLINE
+uint32_t
+page_get_page_no(
+/*=============*/
+	const page_t*	page);	/*!< in: page */
+
+/*************************************************************//**
+Gets the tablespace identifier.
+@return space id */
+UNIV_INLINE
+uint32_t
+page_get_space_id(
+/*==============*/
+	const page_t*	page);	/*!< in: page */
+
+/*************************************************************//**
+Gets the number of user records on page (the infimum and supremum records
+are not user records).
+@return number of user records */
+UNIV_INLINE
+uint16_t
+page_get_n_recs(
+/*============*/
+	const page_t*	page);	/*!< in: index page */
+
+/** Return the number of preceding records in an index page.
+@param rec index record
+@return number of preceding records, including the infimum pseudo-record
+@retval ULINT_UNDEFINED on corrupted page */
+ulint page_rec_get_n_recs_before(const rec_t *rec);
+/*************************************************************//**
+Gets the number of records in the heap.
+@return number of records in the heap */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_heap(
+/*================*/
+	const page_t*	page);	/*!< in: index page */
+/*************************************************************//**
+Gets the number of dir slots in directory.
+@return number of slots */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_slots(
+/*=================*/
+	const page_t*	page);	/*!< in: index page */
+/** Gets the pointer to a slot of the sparse page directory.
+@param n  sparse directory slot number, less than page_dir_get_n_slots(page)
+@return pointer to the sparse directory slot */
+inline page_dir_slot_t *page_dir_get_nth_slot(page_t *page, ulint n)
+{
+  ut_ad(page_dir_get_n_slots(page) > n);
+  static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
+  return my_assume_aligned<2>(page + srv_page_size - (PAGE_DIR + 2) - n * 2);
+}
+inline const page_dir_slot_t *page_dir_get_nth_slot(const page_t *page,ulint n)
+{ /* const overload: forwards to the non-const page_dir_get_nth_slot() */
+  return page_dir_get_nth_slot(const_cast<page_t*>(page), n);
+}
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+@return TRUE if succeed */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+	const rec_t*	rec);	/*!< in: record */
+/** Get the record pointed to by a directory slot.
+@param[in] slot   directory slot (holds a 2-byte offset within the page)
+@return start of the page frame of slot, plus the offset read from slot */
+inline rec_t *page_dir_slot_get_rec(page_dir_slot_t *slot)
+{
+  return page_align(slot) + mach_read_from_2(my_assume_aligned<2>(slot));
+}
+inline const rec_t *page_dir_slot_get_rec(const page_dir_slot_t *slot)
+{ /* const overload: forwards to the non-const page_dir_slot_get_rec() */
+  return page_dir_slot_get_rec(const_cast<rec_t*>(slot));
+}
+
+/** Get the record of a directory slot, or nullptr if out of bounds. */
+inline rec_t *page_dir_slot_get_rec_validate(page_dir_slot_t *slot)
+{
+  page_t *const frame= page_align(slot);
+  const size_t offs= mach_read_from_2(my_assume_aligned<2>(slot));
+  if (UNIV_LIKELY(offs >= PAGE_NEW_INFIMUM &&
+                  offs <= page_header_get_field(frame, PAGE_HEAP_TOP)))
+    return frame + offs;
+  return nullptr;
+}
+inline const rec_t *page_dir_slot_get_rec_validate(const page_dir_slot_t *slot)
+{ /* const overload: forwards to the non-const variant */
+  return page_dir_slot_get_rec_validate(const_cast<rec_t*>(slot));
+}
+
+
+/***************************************************************//**
+Gets the number of records owned by a directory slot.
+@return number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+	const page_dir_slot_t*	slot);	/*!< in: page directory slot */
+/************************************************************//**
+Calculates the space reserved for directory slots of a given
+number of records. The exact value is a fraction number
+n * PAGE_DIR_SLOT_SIZE / PAGE_DIR_SLOT_MIN_N_OWNED, and it is
+rounded upwards to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+	ulint	n_recs);	/*!< in: number of records */
+/***************************************************************//**
+Looks for the directory slot which owns the given record.
+@return the directory slot number
+@retval ULINT_UNDEFINED on corruption */
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+	const rec_t*	rec);	/*!< in: the physical record */
+
+/***************************************************************//**
+Returns the heap number of a record.
+@return heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+	const rec_t*	rec);	/*!< in: the physical record */
+/** Determine whether a page has any siblings.
+@param[in]	page	page frame
+@return true if FIL_PAGE_PREV or FIL_PAGE_NEXT is not FIL_NULL */
+inline bool page_has_siblings(const page_t* page)
+{
+	/* The two adjacent 32-bit links are tested with one 64-bit load. */
+	compile_time_assert(!(FIL_PAGE_PREV % 8));
+	compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4);
+	compile_time_assert(FIL_NULL == 0xffffffff);
+	return ~*reinterpret_cast<const uint64_t*>(page + FIL_PAGE_PREV) != 0;
+}
+
+/** Check whether the FIL_PAGE_PREV field of a page is set.
+@param[in]	page	page frame
+@return true if FIL_PAGE_PREV differs from FIL_NULL */
+inline bool page_has_prev(const page_t* page)
+{
+	const uint32_t* const	prev_link
+		= reinterpret_cast<const uint32_t*>(page + FIL_PAGE_PREV);
+
+	return *prev_link != FIL_NULL;
+}
+
+/** Check whether the FIL_PAGE_NEXT field of a page is set.
+@param[in]	page	page frame
+@return true if FIL_PAGE_NEXT differs from FIL_NULL */
+inline bool page_has_next(const page_t* page)
+{
+	const uint32_t* const	next_link
+		= reinterpret_cast<const uint32_t*>(page + FIL_PAGE_NEXT);
+
+	return *next_link != FIL_NULL;
+}
+
+/** Fetch the persisted AUTO_INCREMENT value of a clustered index.
+Debug builds assert that the page type is FIL_PAGE_INDEX or
+FIL_PAGE_TYPE_INSTANT and that the page has no siblings.
+@param[in]	page	clustered index root page
+@return	the value stored in PAGE_ROOT_AUTO_INC */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline uint64_t page_get_autoinc(const page_t *page)
+{
+  ut_d(uint16_t page_type= fil_page_get_type(page));
+  ut_ad(page_type == FIL_PAGE_INDEX || page_type == FIL_PAGE_TYPE_INSTANT);
+  ut_ad(!page_has_siblings(page));
+  /* The 8-byte field is read through an 8-byte aligned pointer. */
+  return mach_read_from_8(my_assume_aligned<8>(page + PAGE_HEADER +
+                                               PAGE_ROOT_AUTO_INC));
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record
+@retval nullptr if the next-record offset is 0, is smaller than the
+offset of the supremum record, or is greater than PAGE_HEAP_TOP */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_low(
+/*==================*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	ulint		comp);	/*!< in: nonzero=compact page layout */
+/************************************************************//**
+Gets the pointer to the next record on the page.
+This is the non-const counterpart of page_rec_get_next_const(); the
+page format is determined with page_rec_is_comp(rec).
+@return pointer to next record
+@retval nullptr if page_rec_get_next_low() rejects the next offset */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+	rec_t*	rec);	/*!< in: pointer to record */
+/************************************************************//**
+Gets the pointer to the next record on the page.
+The page format is determined with page_rec_is_comp(rec).
+@return pointer to next record
+@retval nullptr if page_rec_get_next_low() rejects the next offset */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_const(
+/*====================*/
+	const rec_t*	rec);	/*!< in: pointer to record */
+/************************************************************//**
+Gets the pointer to the previous record.
+page_rec_get_prev() is a thin wrapper around this function for
+non-const record pointers.
+@return pointer to previous record
+@retval nullptr on error */
+const rec_t*
+page_rec_get_prev_const(
+/*====================*/
+	const rec_t*	rec);	/*!< in: pointer to record, must not be
+				the page infimum */
+/************************************************************//**
+Gets the pointer to the previous record, for a mutable record pointer.
+The lookup itself is done by page_rec_get_prev_const().
+@param rec  record (not page infimum)
+@return pointer to previous record
+@retval nullptr on error */
+inline rec_t *page_rec_get_prev(rec_t *rec)
+{
+  const rec_t *prev= page_rec_get_prev_const(rec);
+  return const_cast<rec_t*>(prev);
+}
+
+/************************************************************//**
+Determines whether the record is the first user record on a page,
+that is, the successor of the page infimum record.
+Debug builds assert that the page contains at least one user record.
+@return true if the first user record */
+UNIV_INLINE
+bool
+page_rec_is_first(
+/*==============*/
+	const rec_t*	rec,	/*!< in: record */
+	const page_t*	page)	/*!< in: page */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/************************************************************//**
+Determines whether the record is the last user record on a page,
+that is, whether its successor is the page supremum record.
+Debug builds assert that the page contains at least one user record.
+@return true if the last user record */
+UNIV_INLINE
+bool
+page_rec_is_last(
+/*=============*/
+	const rec_t*	rec,	/*!< in: record */
+	const page_t*	page)	/*!< in: page */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of record heap.
+@return maximum combined size for inserted records
+@retval 0 if the occupied space already exceeds the free space of an
+empty page */
+UNIV_INLINE
+ulint
+page_get_max_insert_size(
+/*=====================*/
+	const page_t*	page,	/*!< in: index page */
+	ulint		n_recs);/*!< in: number of records to be inserted */
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of record heap if page is first reorganized.
+@return maximum combined size for inserted records
+@retval 0 if the occupied space would exceed the free space of an
+empty page */
+UNIV_INLINE
+ulint
+page_get_max_insert_size_after_reorganize(
+/*======================================*/
+	const page_t*	page,	/*!< in: index page */
+	ulint		n_recs);/*!< in: number of records to be inserted */
+/*************************************************************//**
+Calculates free space if a page is emptied.
+The result is srv_page_size minus the end offset of the supremum
+record (which depends on the page format), PAGE_DIR and the size of
+two directory slots.
+@return free space */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(
+/*=========================*/
+	ulint	comp)	/*!< in: nonzero=compact page format */
+		MY_ATTRIBUTE((const));
+/************************************************************//**
+Returns the sum of the sizes of the records in the record list
+excluding the infimum and supremum records.
+It is computed as PAGE_HEAP_TOP minus the end offset of the supremum
+record minus PAGE_GARBAGE.
+@return data in bytes */
+UNIV_INLINE
+uint16_t
+page_get_data_size(
+/*===============*/
+	const page_t*	page);	/*!< in: index page */
+/** Read the PAGE_DIRECTION field from a byte.
+The field occupies the 3 least significant bits of the byte.
+@param[in]	ptr	pointer to PAGE_DIRECTION_B
+@return	the value of the PAGE_DIRECTION field */
+inline
+byte
+page_ptr_get_direction(const byte* ptr);
+
+/** Read the PAGE_DIRECTION field of an index page.
+@param[in]	page	index page
+@return	the value of the PAGE_DIRECTION field */
+inline
+byte
+page_get_direction(const page_t* page)
+{
+	/* Locate the PAGE_DIRECTION_B byte in the page header and let
+	page_ptr_get_direction() extract the field from it. */
+	const byte*	direction_byte = page + PAGE_HEADER + PAGE_DIRECTION_B;
+
+	return page_ptr_get_direction(direction_byte);
+}
+
+/** Read the PAGE_INSTANT field.
+The value is the 16-bit PAGE_INSTANT header field shifted right by
+3 bits.
+@param[in]	page	index page
+@return the value of the PAGE_INSTANT field */
+inline
+uint16_t
+page_get_instant(const page_t* page);
+
+/** Create an uncompressed index page.
+@param[in,out]	block	buffer block
+@param[in,out]	mtr	mini-transaction
+@param[in]	comp	true for the compact page format, that is,
+			set unless ROW_FORMAT=REDUNDANT */
+void page_create(buf_block_t *block, mtr_t *mtr, bool comp);
+/**********************************************************//**
+Create a compressed B-tree index page.
+See page_create() for the uncompressed counterpart. */
+void
+page_create_zip(
+/*============*/
+	buf_block_t*		block,		/*!< in/out: a buffer frame
+						where the page is created */
+	dict_index_t*		index,		/*!< in: the index of the
+						page */
+	ulint			level,		/*!< in: the B-tree level of
+						the page */
+	trx_id_t		max_trx_id,	/*!< in: PAGE_MAX_TRX_ID */
+	mtr_t*			mtr);		/*!< in/out: mini-transaction
+						handle */
+/**********************************************************//**
+Empty a previously created B-tree index page.
+Only block and index are declared nonnull; the attribute does not
+cover mtr. */
+void
+page_create_empty(
+/*==============*/
+	buf_block_t*	block,	/*!< in/out: B-tree block */
+	dict_index_t*	index,	/*!< in: the index of the page */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull(1,2)));
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/*************************************************************//**
+Differs from page_copy_rec_list_end, because this function does not
+touch the lock table and max trx id on page or compress the page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_t::commit().
+
+All pointer parameters are declared nonnull, and the returned error
+code must not be ignored (warn_unused_result).
+
+@return error code */
+dberr_t
+page_copy_rec_list_end_no_locks(
+/*============================*/
+	buf_block_t*	new_block,	/*!< in: index page to copy to */
+	buf_block_t*	block,		/*!< in: index page of rec */
+	rec_t*		rec,		/*!< in: record on page */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mtr_t*		mtr);		/*!< in: mtr */
+/*************************************************************//**
+Copies records from page to new_page, from the given record onward,
+including that record. Infimum and supremum records are not copied.
+The records are copied to the start of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_t::commit().
+
+The nonnull attribute covers the first five parameters only; it does
+not cover err.
+
+@return pointer to the original successor of the infimum record on new_block
+@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */
+rec_t*
+page_copy_rec_list_end(
+/*===================*/
+	buf_block_t*	new_block,	/*!< in/out: index page to copy to */
+	buf_block_t*	block,		/*!< in: index page containing rec */
+	rec_t*		rec,		/*!< in: record on page */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mtr_t*		mtr,		/*!< in/out: mini-transaction */
+	dberr_t*	err)		/*!< out: error code */
+	MY_ATTRIBUTE((nonnull(1,2,3,4,5), warn_unused_result));
+/*************************************************************//**
+Copies records from page to new_page, up to the given record, NOT
+including that record. Infimum and supremum records are not copied.
+The records are copied to the end of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_t::commit().
+
+All pointer parameters, including err, are declared nonnull.
+
+@return pointer to the original predecessor of the supremum record on new_block
+@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */
+rec_t*
+page_copy_rec_list_start(
+/*=====================*/
+	buf_block_t*	new_block,	/*!< in/out: index page to copy to */
+	buf_block_t*	block,		/*!< in: index page containing rec */
+	rec_t*		rec,		/*!< in: record on page */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mtr_t*		mtr,		/*!< in/out: mini-transaction */
+	dberr_t*	err)		/*!< out: error code */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Deletes records from a page from a given record onward, including that record.
+The infimum and supremum records are not deleted.
+@return error code (must not be ignored: warn_unused_result) */
+dberr_t
+page_delete_rec_list_end(
+/*=====================*/
+	rec_t*		rec,	/*!< in: pointer to record on page */
+	buf_block_t*	block,	/*!< in: buffer block of the page */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	ulint		n_recs,	/*!< in: number of records to delete,
+				or ULINT_UNDEFINED if not known */
+	ulint		size,	/*!< in: the sum of the sizes of the
+				records in the end of the chain to
+				delete, or ULINT_UNDEFINED if not known */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Deletes records from page, up to the given record, NOT including
+that record. Infimum and supremum records are not deleted.
+Unlike page_delete_rec_list_end(), this function returns no error
+code. */
+void
+page_delete_rec_list_start(
+/*=======================*/
+	rec_t*		rec,	/*!< in: record on page */
+	buf_block_t*	block,	/*!< in: buffer block of the page */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((nonnull));
+/** Create an index page.
+Unlike page_create(), this function takes no mini-transaction.
+@param[in,out]	block	buffer block
+@param[in]	comp	true=compact page format */
+void page_create_low(const buf_block_t* block, bool comp);
+
+/************************************************************//**
+Prints record contents including the data relevant only in
+the index page context.
+This declaration is not guarded by UNIV_BTR_PRINT, unlike the page
+printing functions that follow it. */
+void
+page_rec_print(
+/*===========*/
+	const rec_t*	rec,	/*!< in: physical record */
+	const rec_offs*	offsets);/*!< in: record descriptor */
+# ifdef UNIV_BTR_PRINT
+/* The following debug printing functions are declared only when
+UNIV_BTR_PRINT is defined. */
+
+/***************************************************************//**
+This is used to print the contents of the directory for
+debugging purposes. */
+void
+page_dir_print(
+/*===========*/
+	page_t*	page,	/*!< in: index page */
+	ulint	pr_n);	/*!< in: print n first and n last entries */
+/***************************************************************//**
+This is used to print the contents of the page record list for
+debugging purposes. */
+void
+page_print_list(
+/*============*/
+	buf_block_t*	block,	/*!< in: index page */
+	dict_index_t*	index,	/*!< in: dictionary index of the page */
+	ulint		pr_n);	/*!< in: print n first and n last entries */
+/***************************************************************//**
+Prints the info in a page header. */
+void
+page_header_print(
+/*==============*/
+	const page_t*	page);	/*!< in: index page */
+/***************************************************************//**
+This is used to print the contents of the page for
+debugging purposes. The two counts limit how much of the page
+directory (dn) and of the records (rn) is printed. */
+void
+page_print(
+/*=======*/
+	buf_block_t*	block,	/*!< in: index page */
+	dict_index_t*	index,	/*!< in: dictionary index of the page */
+	ulint		dn,	/*!< in: print dn first and last entries
+				in directory */
+	ulint		rn);	/*!< in: print rn first and last records
+				in directory */
+# endif /* UNIV_BTR_PRINT */
+/***************************************************************//**
+The following is used to validate a record on a page. This function
+differs from rec_validate as it can also check the n_owned field and
+the heap_no field.
+@return TRUE if ok */
+ibool
+page_rec_validate(
+/*==============*/
+	const rec_t*	rec,	/*!< in: physical record */
+	const rec_offs*	offsets);/*!< in: array returned by
+				rec_get_offsets() for rec */
+#ifdef UNIV_DEBUG
+/***************************************************************//**
+Checks that the first directory slot points to the infimum record and
+the last to the supremum. This function is intended to track if the
+bug fixed in 4.0.14 has caused corruption to users' databases.
+It is declared only in UNIV_DEBUG builds. */
+void
+page_check_dir(
+/*===========*/
+	const page_t*	page);	/*!< in: index page */
+#endif /* UNIV_DEBUG */
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. This is also resilient so that this should never crash
+even if the page is total garbage.
+Use page_simple_validate_new() for pages in other row formats.
+@return TRUE if ok */
+ibool
+page_simple_validate_old(
+/*=====================*/
+	const page_t*	page);	/*!< in: index page in ROW_FORMAT=REDUNDANT */
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. This is also resilient so that this should never crash
+even if the page is total garbage.
+Use page_simple_validate_old() for ROW_FORMAT=REDUNDANT pages.
+@return TRUE if ok */
+ibool
+page_simple_validate_new(
+/*=====================*/
+	const page_t*	page);	/*!< in: index page in ROW_FORMAT!=REDUNDANT */
+/** Check the consistency of an index page.
+Unlike page_simple_validate_old() and page_simple_validate_new(),
+this check is given the index that the page belongs to.
+@param[in]	page	index page (must not be null)
+@param[in]	index	B-tree or R-tree index (must not be null)
+@return	whether the page is valid */
+bool page_validate(const page_t* page, const dict_index_t* index)
+	MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Looks in the page record list for a record with the given heap number.
+@return pointer to the record with the given heap number
+@retval NULL if no such record was found */
+const rec_t*
+page_find_rec_with_heap_no(
+/*=======================*/
+	const page_t*	page,	/*!< in: index page */
+	ulint		heap_no);/*!< in: heap number */
+/** Get the last non-delete-marked record on a page.
+@param[in]	page	index tree leaf page
+@return the last record that is not delete-marked
+@retval infimum record if all records are delete-marked */
+const rec_t *page_find_rec_last_not_deleted(const page_t *page);
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#include "page0page.inl"
+
+#endif
diff --git a/storage/innobase/include/page0page.inl b/storage/innobase/include/page0page.inl
new file mode 100644
index 00000000..6c0167ed
--- /dev/null
+++ b/storage/innobase/include/page0page.inl
@@ -0,0 +1,550 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0page.inl
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_INNOCHECKSUM
+#include "rem0cmp.h"
+#include "mtr0log.h"
+#include "page0zip.h"
+
+/*************************************************************//**
+Raises the max trx id field of a page to trx_id, unless the value
+already stored on the page is at least as large. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+	buf_block_t*	block,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	ut_ad(block);
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(trx_id);
+	ut_ad(page_is_leaf(buf_block_get_frame(block)));
+
+	if (page_get_max_trx_id(buf_block_get_frame(block)) >= trx_id) {
+		/* Nothing to do: the stored value is large enough. */
+		return;
+	}
+
+	page_set_max_trx_id(block, page_zip, trx_id, mtr);
+}
+
+/*************************************************************//**
+Reads the 8-byte RTREE SPLIT SEQUENCE NUMBER stored at
+FIL_RTREE_SPLIT_SEQ_NUM.
+@return	SPLIT SEQUENCE NUMBER */
+UNIV_INLINE
+node_seq_t
+page_get_ssn_id(
+/*============*/
+	const page_t*	page)	/*!< in: page */
+{
+	ut_ad(page);
+
+	const page_t*	field = page + FIL_RTREE_SPLIT_SEQ_NUM;
+
+	return static_cast<node_seq_t>(mach_read_from_8(field));
+}
+
+/*************************************************************//**
+Writes the RTREE SPLIT SEQUENCE NUMBER field of a page through the
+mini-transaction and, when a compressed page is given, copies the
+written bytes to it. */
+UNIV_INLINE
+void
+page_set_ssn_id(
+/*============*/
+	buf_block_t*	block,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	node_seq_t	ssn_id,	/*!< in: split sequence number */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_SX_FIX |
+                                   MTR_MEMO_PAGE_X_FIX));
+  ut_ad(!page_zip || page_zip == &block->page.zip);
+
+  constexpr uint16_t offset= FIL_RTREE_SPLIT_SEQ_NUM;
+  byte *const frame_field= my_assume_aligned<2>(&block->page.frame[offset]);
+
+  /* Nothing more to do unless mtr_t::write() returned true. */
+  if (!mtr->write<8,mtr_t::MAYBE_NOP>(*block, frame_field, ssn_id))
+    return;
+
+  if (UNIV_LIKELY_NULL(page_zip))
+    memcpy_aligned<2>(&page_zip->data[offset], frame_field, 8);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Reads one of the page header fields that hold an offset within the
+page: PAGE_FREE, PAGE_LAST_INSERT or PAGE_HEAP_TOP.
+@return offset from the start of the page, or 0 */
+UNIV_INLINE
+uint16_t
+page_header_get_offs(
+/*=================*/
+	const page_t*	page,	/*!< in: page */
+	ulint		field)	/*!< in: PAGE_FREE, ... */
+{
+	/* Only the three offset-valued header fields are accepted. */
+	ut_ad((field == PAGE_FREE)
+	      || (field == PAGE_LAST_INSERT)
+	      || (field == PAGE_HEAP_TOP));
+
+	const uint16_t	offs = page_header_get_field(page, field);
+
+	/* PAGE_HEAP_TOP is never expected to be 0. */
+	ut_ad((field != PAGE_HEAP_TOP) || offs);
+
+	return offs;
+}
+
+
+/**
+Set PAGE_LAST_INSERT to 0, also in the compressed page if there is one.
+@param[in,out]  block    file page
+@param[in,out]  mtr      mini-transaction */
+inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr)
+{
+  constexpr uint16_t offset= PAGE_HEADER + PAGE_LAST_INSERT;
+  byte *const last_insert= my_assume_aligned<2>(&block->page.frame[offset]);
+
+  /* Nothing more to do unless mtr_t::write() returned true. */
+  if (!mtr->write<2,mtr_t::MAYBE_NOP>(*block, last_insert, 0U))
+    return;
+
+  if (UNIV_LIKELY_NULL(block->page.zip.data))
+    memset_aligned<2>(&block->page.zip.data[offset], 0, 2);
+}
+
+/***************************************************************//**
+Returns the heap number of a record, using the accessor that matches
+the format of the page on which the record resides.
+@return heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+	const rec_t*	rec)	/*!< in: the physical record */
+{
+	return page_rec_is_comp(rec)
+		? rec_get_heap_no_new(rec)
+		: rec_get_heap_no_old(rec);
+}
+
+/** Check whether a record in an index page is a user record.
+The decision is made from the offset of the record within its page.
+@param[in]	rec	record in an index page
+@return true if a user record */
+inline
+bool
+page_rec_is_user_rec(const rec_t* rec)
+{
+	ut_ad(page_rec_check(rec));
+
+	const auto	offset = page_offset(rec);
+
+	return page_rec_is_user_rec_low(offset);
+}
+
+/** Check whether a record in an index page is the supremum record.
+The decision is made from the offset of the record within its page.
+@param[in]	rec	record in an index page
+@return true if the supremum record */
+inline
+bool
+page_rec_is_supremum(const rec_t* rec)
+{
+	ut_ad(page_rec_check(rec));
+
+	const auto	offset = page_offset(rec);
+
+	return page_rec_is_supremum_low(offset);
+}
+
+/** Check whether a record in an index page is the infimum record.
+The decision is made from the offset of the record within its page.
+@param[in]	rec	record in an index page
+@return true if the infimum record */
+inline
+bool
+page_rec_is_infimum(const rec_t* rec)
+{
+	ut_ad(page_rec_check(rec));
+
+	const auto	offset = page_offset(rec);
+
+	return page_rec_is_infimum_low(offset);
+}
+
+/************************************************************//**
+Determines whether the record is the first user record on a page,
+that is, the successor of the page infimum record.
+@return true if the first user record */
+UNIV_INLINE
+bool
+page_rec_is_first(
+/*==============*/
+	const rec_t*	rec,	/*!< in: record */
+	const page_t*	page)	/*!< in: page */
+{
+	ut_ad(page_get_n_recs(page) > 0);
+
+	const rec_t*	first_user_rec
+		= page_rec_get_next_const(page_get_infimum_rec(page));
+
+	return first_user_rec == rec;
+}
+
+/************************************************************//**
+Determines whether the record is the last user record on a page,
+that is, whether its successor is the page supremum record.
+@return true if the last user record */
+UNIV_INLINE
+bool
+page_rec_is_last(
+/*=============*/
+	const rec_t*	rec,	/*!< in: record */
+	const page_t*	page)	/*!< in: page */
+{
+	ut_ad(page_get_n_recs(page) > 0);
+
+	const rec_t*	successor = page_rec_get_next_const(rec);
+
+	return successor == page_get_supremum_rec(page);
+}
+
+/************************************************************//**
+Returns the middle record of the records on the page. If there is an
+even number of records in the list, returns the first record of the
+upper half-list.
+@return middle record */
+UNIV_INLINE
+rec_t*
+page_get_middle_rec(
+/*================*/
+	page_t*	page)	/*!< in: page */
+{
+	/* Position of the middle record, as passed to
+	page_rec_get_nth(): (n_recs + PAGE_HEAP_NO_USER_LOW) / 2 */
+	const ulint	shifted_count = ulint(page_get_n_recs(page))
+		+ PAGE_HEAP_NO_USER_LOW;
+
+	return page_rec_get_nth(page, shifted_count / 2);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Reads the page number stored in the FIL_PAGE_OFFSET field.
+@return page number */
+UNIV_INLINE
+uint32_t
+page_get_page_no(
+/*=============*/
+	const page_t*	page)	/*!< in: page; must point to the start
+				of the page frame */
+{
+  ut_ad(page == page_align((page_t*) page));
+  const auto *field= my_assume_aligned<4>(page + FIL_PAGE_OFFSET);
+  return mach_read_from_4(field);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Reads the tablespace identifier stored in the
+FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID field.
+@return space id */
+UNIV_INLINE
+uint32_t
+page_get_space_id(
+/*==============*/
+	const page_t*	page)	/*!< in: page; must point to the start
+				of the page frame */
+{
+  ut_ad(page == page_align((page_t*) page));
+  /* The 4-byte field is read through a 2-byte aligned pointer. */
+  const auto *field=
+    my_assume_aligned<2>(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+  return mach_read_from_4(field);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Reads the PAGE_N_RECS header field, that is, the number of user
+records on the page. The infimum and supremum records are not user
+records.
+@return number of user records */
+UNIV_INLINE
+uint16_t
+page_get_n_recs(
+/*============*/
+	const page_t*	page)	/*!< in: index page */
+{
+	const uint16_t	n_recs = page_header_get_field(page, PAGE_N_RECS);
+
+	return n_recs;
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Reads the PAGE_N_DIR_SLOTS header field, that is, the number of
+slots in the page directory.
+@return number of slots */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_slots(
+/*=================*/
+	const page_t*	page)	/*!< in: index page */
+{
+	const uint16_t	n_slots
+		= page_header_get_field(page, PAGE_N_DIR_SLOTS);
+
+	return n_slots;
+}
+
+/*************************************************************//**
+Gets the number of records in the heap. The value is the PAGE_N_HEAP
+header field with its most significant bit masked off.
+@return number of records in the heap */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_heap(
+/*================*/
+	const page_t*	page)	/*!< in: index page */
+{
+	return static_cast<uint16_t>(
+		page_header_get_field(page, PAGE_N_HEAP) & 0x7fff);
+}
+
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+The checks are ut_a() assertions: the record pointer must not be
+null, and its offset within the page must lie between PAGE_DATA and
+PAGE_HEAP_TOP (both inclusive).
+@return TRUE if succeed; a failed check does not return FALSE but
+trips the assertion */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+	const rec_t*	rec)	/*!< in: record */
+{
+	/* page_align() only computes the frame address; rec itself
+	is checked for null right below. */
+	const page_t*	page = page_align(rec);
+
+	ut_a(rec);
+
+	/* The record must start within the used part of the page. */
+	ut_a(page_offset(rec) <= page_header_get_field(page, PAGE_HEAP_TOP));
+	ut_a(page_offset(rec) >= PAGE_DATA);
+
+	return(TRUE);
+}
+
+/***************************************************************//**
+Gets the number of records owned by a directory slot. The count is
+read from the record that the slot points to.
+@return number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+	const page_dir_slot_t*	slot)	/*!< in: page directory slot */
+{
+	const rec_t* const	owner = page_dir_slot_get_rec(slot);
+
+	/* The slot and the record reside on the same page, so the
+	page format can be determined from the slot pointer. */
+	return page_rec_is_comp(slot)
+		? rec_get_n_owned_new(owner)
+		: rec_get_n_owned_old(owner);
+}
+
+/************************************************************//**
+Calculates the space reserved for directory slots of a given number of
+records. The exact value is a fraction number n * PAGE_DIR_SLOT_SIZE /
+PAGE_DIR_SLOT_MIN_N_OWNED, and it is rounded upwards to an integer.
+@return the reserved space, rounded up to an integer */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+	ulint	n_recs)		/*!< in: number of records */
+{
+	/* Round up by adding (divisor - 1) before the division. */
+	const ulint	numerator = PAGE_DIR_SLOT_SIZE * n_recs
+		+ PAGE_DIR_SLOT_MIN_N_OWNED - 1;
+
+	return numerator / PAGE_DIR_SLOT_MIN_N_OWNED;
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record
+@retval nullptr if the next-record offset is 0, is smaller than the
+offset of the supremum record, or is greater than PAGE_HEAP_TOP */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_low(
+/*==================*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	ulint		comp)	/*!< in: nonzero=compact page layout */
+{
+  const page_t *page= page_align(rec);
+  ut_ad(page_rec_check(rec));
+
+  const ulint offs= rec_get_next_offs(rec, comp);
+  /* The smallest offset that is accepted for the next record */
+  const ulint lowest= comp ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM;
+
+  if (!offs)
+    return nullptr;
+  if (UNIV_UNLIKELY(offs < lowest) ||
+      UNIV_UNLIKELY(offs > page_header_get_field(page, PAGE_HEAP_TOP)))
+    return nullptr;
+
+  ut_ad(page_rec_is_infimum(rec) ||
+        (!page_is_leaf(page) && !page_has_prev(page)) ||
+        !(rec_get_info_bits(page + offs, comp) & REC_INFO_MIN_REC_FLAG));
+  return page + offs;
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page, for a mutable
+record pointer. The page format is determined from the record.
+@return pointer to next record */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+	rec_t*	rec)	/*!< in: pointer to record */
+{
+	const ulint	comp = page_rec_is_comp(rec);
+	const rec_t*	next = page_rec_get_next_low(rec, comp);
+
+	return const_cast<rec_t*>(next);
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page, for a const record
+pointer. The page format is determined from the record.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_const(
+/*====================*/
+	const rec_t*	rec)	/*!< in: pointer to record */
+{
+	const ulint	comp = page_rec_is_comp(rec);
+
+	return page_rec_get_next_low(rec, comp);
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/************************************************************//**
+Returns the sum of the sizes of the records in the record list, excluding
+the infimum and supremum records. It is computed as PAGE_HEAP_TOP minus
+the end offset of the supremum record minus PAGE_GARBAGE.
+@return data in bytes */
+UNIV_INLINE
+uint16_t
+page_get_data_size(
+/*===============*/
+	const page_t*	page)	/*!< in: index page */
+{
+	const unsigned	heap_top
+		= page_header_get_field(page, PAGE_HEAP_TOP);
+	/* Where the supremum record ends depends on the page format. */
+	const unsigned	records_start = page_is_comp(page)
+		? PAGE_NEW_SUPREMUM_END
+		: PAGE_OLD_SUPREMUM_END;
+	const unsigned	garbage
+		= page_header_get_field(page, PAGE_GARBAGE);
+
+	unsigned ret = heap_top - records_start - garbage;
+	ut_ad(ret < srv_page_size);
+	return static_cast<uint16_t>(ret);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Calculates free space if a page is emptied: srv_page_size minus the
+end offset of the supremum record, PAGE_DIR and two directory slots.
+@return free space */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(
+/*=========================*/
+	ulint	comp)		/*!< in: nonzero=compact page layout */
+{
+	/* Only the end offset of the supremum record depends on the
+	page layout. */
+	const auto	supremum_end = comp
+		? PAGE_NEW_SUPREMUM_END
+		: PAGE_OLD_SUPREMUM_END;
+
+	return (ulint)(srv_page_size
+		       - supremum_end
+		       - PAGE_DIR
+		       - 2 * PAGE_DIR_SLOT_SIZE);
+}
+
+/************************************************************//**
+Each user record on a page, and also the deleted user records in the heap
+takes its size plus the fraction of the dir cell size /
+PAGE_DIR_SLOT_MIN_N_OWNED bytes for it. If the sum of these exceeds the
+value of page_get_free_space_of_empty, the insert is impossible, otherwise
+it is allowed. This function returns the maximum combined size of records
+which can be inserted on top of the record heap.
+@return maximum combined size for inserted records
+@retval 0 if the occupied space exceeds the free space of an empty page */
+UNIV_INLINE
+ulint
+page_get_max_insert_size(
+/*=====================*/
+	const page_t*	page,	/*!< in: index page */
+	ulint		n_recs)	/*!< in: number of records */
+{
+	const ulint	comp = page_is_comp(page);
+
+	/* The 'n_recs +' part reserves directory space for the new
+	inserted records; the '- 2' excludes page infimum and supremum
+	records */
+	const ulint	dir_space = page_dir_calc_reserved_space(
+		n_recs + page_dir_get_n_heap(page) - 2);
+
+	const ulint	occupied
+		= page_header_get_field(page, PAGE_HEAP_TOP)
+		- (comp ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END)
+		+ dir_space;
+
+	const ulint	free_space = page_get_free_space_of_empty(comp);
+
+	return occupied > free_space ? 0 : free_space - occupied;
+}
+
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of the record heap if a page is first reorganized.
+@return maximum combined size for inserted records
+@retval 0 if the occupied space would exceed the free space of an
+empty page */
+UNIV_INLINE
+ulint
+page_get_max_insert_size_after_reorganize(
+/*======================================*/
+	const page_t*	page,	/*!< in: index page */
+	ulint		n_recs)	/*!< in: number of records */
+{
+	/* Space taken by the existing user records plus the directory
+	space for the existing and the n_recs new records */
+	const ulint	occupied = page_get_data_size(page)
+		+ page_dir_calc_reserved_space(n_recs + page_get_n_recs(page));
+
+	const ulint	free_space
+		= page_get_free_space_of_empty(page_is_comp(page));
+
+	return occupied > free_space ? 0 : free_space - occupied;
+}
+
+/** Extract the PAGE_DIRECTION field from the byte that holds it.
+The field occupies the 3 least significant bits of the byte.
+@param[in]	ptr	pointer to PAGE_DIRECTION_B
+@return	the value of the PAGE_DIRECTION field */
+inline
+byte
+page_ptr_get_direction(const byte* ptr)
+{
+	ut_ad(page_offset(ptr) == PAGE_HEADER + PAGE_DIRECTION_B);
+
+	constexpr unsigned	direction_mask = (1U << 3) - 1;
+
+	return static_cast<byte>(*ptr & direction_mask);
+}
+
+/** Read the PAGE_INSTANT field.
+The value is the 16-bit PAGE_INSTANT header field shifted right by
+3 bits. Debug builds additionally check the field against the page
+type.
+@param[in]	page	index page
+@return the value of the PAGE_INSTANT field */
+inline
+uint16_t
+page_get_instant(const page_t* page)
+{
+	uint16_t i = page_header_get_field(page, PAGE_INSTANT);
+#ifdef UNIV_DEBUG
+	/* Sanity checks that depend on the page type */
+	switch (fil_page_get_type(page)) {
+	case FIL_PAGE_TYPE_INSTANT:
+		/* The value returned below must be nonzero. */
+		ut_ad(page_get_direction(page) <= PAGE_NO_DIRECTION);
+		ut_ad(i >> 3);
+		break;
+	case FIL_PAGE_INDEX:
+		/* On a compact page, the whole field must not exceed
+		PAGE_NO_DIRECTION. */
+		ut_ad(i <= PAGE_NO_DIRECTION || !page_is_comp(page));
+		break;
+	case FIL_PAGE_RTREE:
+		ut_ad(i <= PAGE_NO_DIRECTION);
+		break;
+	default:
+		/* Any other page type always fails the assertion. */
+		ut_ad("invalid page type" == 0);
+		break;
+	}
+#endif /* UNIV_DEBUG */
+	return static_cast<uint16_t>(i >> 3);  /* i / 8 */
+}
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/page0types.h b/storage/innobase/include/page0types.h
new file mode 100644
index 00000000..83fc45cd
--- /dev/null
+++ b/storage/innobase/include/page0types.h
@@ -0,0 +1,188 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0types.h
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0types_h
+#define page0types_h
+
+#include "dict0types.h"
+#include "mtr0types.h"
+#include "rem0types.h"
+#include "ut0new.h"
+
+#include <map>
+
+/** Eliminates a name collision on HP-UX */
+#define page_t	   ib_page_t
+/** Type of the index page */
+typedef	byte		page_t;
+#ifndef UNIV_INNOCHECKSUM
+/** Index page cursor */
+struct page_cur_t;
+/** Buffer pool block */
+struct buf_block_t;
+
+/** Compressed index page */
+typedef byte		page_zip_t;
+
+/* The following definitions would better belong to page0zip.h,
+but we cannot include page0zip.h from rem0rec.ic, because
+page0*.h includes rem0rec.h and may include rem0rec.ic. */
+
+/** Number of bits needed for representing different compressed page sizes */
+#define PAGE_ZIP_SSIZE_BITS 3
+
+/** Maximum compressed page shift size */
+#define PAGE_ZIP_SSIZE_MAX	\
+	(UNIV_ZIP_SIZE_SHIFT_MAX - UNIV_ZIP_SIZE_SHIFT_MIN + 1)
+
+/* Make sure there are enough bits available to store the maximum zip
+ssize, which is the number of shifts from 512. */
+#if PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)
+# error "PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)"
+#endif
+
+/* Page cursor search modes; the values must be in this order!
+No enumerator currently has the value 5 or 6: the values jump from
+PAGE_CUR_LE = 4 to PAGE_CUR_CONTAIN = 7, and PAGE_CUR_LE_OR_EXTENDS = 5
+is commented out below. */
+enum page_cur_mode_t {
+	PAGE_CUR_UNSUPP	= 0,
+	PAGE_CUR_G	= 1,
+	PAGE_CUR_GE	= 2,
+	PAGE_CUR_L	= 3,
+	PAGE_CUR_LE	= 4,
+
+/*      PAGE_CUR_LE_OR_EXTENDS = 5,*/ /* This is a search mode used in
+				 "column LIKE 'abc%' ORDER BY column DESC";
+				 we have to find strings which are <= 'abc' or
+				 which extend it */
+
+/* The following search modes are for searching an R-tree index. */
+	PAGE_CUR_CONTAIN		= 7,
+	PAGE_CUR_INTERSECT		= 8,
+	PAGE_CUR_WITHIN			= 9,
+	PAGE_CUR_DISJOINT		= 10,
+	PAGE_CUR_MBR_EQUAL		= 11,
+	PAGE_CUR_RTREE_INSERT		= 12,
+	PAGE_CUR_RTREE_LOCATE		= 13,
+	PAGE_CUR_RTREE_GET_FATHER	= 14
+};
+
+class buf_pool_t;
+class buf_page_t;
+
+/** Compressed page descriptor.
+
+Layout note: clear() and the two-argument constructor operate on the raw
+bytes from the start of the object up to (but not including) the member
+"fix". Any data member that must be covered by them therefore has to be
+declared before "fix". */
+struct page_zip_des_t
+{
+	page_zip_t*	data;		/*!< compressed page data */
+
+	/* The four bit-fields below occupy 16 + 1 + 12 + PAGE_ZIP_SSIZE_BITS
+	bits in total. */
+	uint32_t	m_end:16;	/*!< end offset of modification log */
+	uint32_t	m_nonempty:1;	/*!< TRUE if the modification log
+					is not empty */
+	uint32_t	n_blobs:12;	/*!< number of externally stored
+					columns on the page; the maximum
+					is 744 on a 16 KiB page */
+	uint32_t	ssize:PAGE_ZIP_SSIZE_BITS;
+					/*!< 0 or compressed page shift size;
+					the size in bytes is
+					(UNIV_ZIP_SIZE_MIN >> 1) << ssize. */
+#ifdef UNIV_DEBUG
+	/* These members exist only in debug builds. */
+	uint16_t	m_start;	/*!< start offset of modification log */
+	bool		m_external;	/*!< Allocated externally, not from the
+					buffer pool */
+#endif /* UNIV_DEBUG */
+
+	/** Zero-fill the descriptor, leaving "fix" untouched. */
+	void clear() {
+		/* Clear everything except the member "fix". */
+		memset((void*) this, 0,
+		       reinterpret_cast<char*>(&fix)
+		       - reinterpret_cast<char*>(this));
+	}
+
+	page_zip_des_t() = default;
+	page_zip_des_t(const page_zip_des_t&) = default;
+
+	/* Initialize everything except the member "fix".
+	The unnamed bool parameter is not read; it only distinguishes this
+	overload from the defaulted copy constructor above, which also
+	copies "fix". */
+	page_zip_des_t(const page_zip_des_t& old, bool) {
+		memcpy((void*) this, (void*) &old,
+		       reinterpret_cast<char*>(&fix)
+		       - reinterpret_cast<char*>(this));
+	}
+
+private:
+	/* Only buf_pool_t and buf_page_t may access "fix". */
+	friend buf_pool_t;
+	friend buf_page_t;
+	/** fix count and state used in buf_page_t */
+	Atomic_relaxed<uint32_t> fix;
+};
+
+/** Compression statistics for a given page size.
+
+All members start out as 0, so that when we do
+stlmap[key].compressed++ and element with "key" does not
+exist it gets inserted with zeroed members. */
+struct page_zip_stat_t {
+	/** Number of page compressions */
+	ulint		compressed = 0;
+	/** Number of successful page compressions */
+	ulint		compressed_ok = 0;
+	/** Number of page decompressions */
+	ulint		decompressed = 0;
+	/** Duration of page compressions in microseconds */
+	ib_uint64_t	compressed_usec = 0;
+	/** Duration of page decompressions in microseconds */
+	ib_uint64_t	decompressed_usec = 0;
+
+	/** Default constructor; the zeroing is done by the default
+	member initializers above. */
+	page_zip_stat_t() {}
+};
+
+/** Compression statistics container: an ordered map from index_id_t to
+page_zip_stat_t, allocated through ut_allocator */
+using page_zip_stat_per_index_t = std::map<
+	index_id_t,
+	page_zip_stat_t,
+	std::less<index_id_t>,
+	ut_allocator<std::pair<const index_id_t, page_zip_stat_t> > >;
+
+/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */
+extern page_zip_stat_t			page_zip_stat[PAGE_ZIP_SSIZE_MAX];
+/** Statistics on compression, indexed by dict_index_t::id */
+extern page_zip_stat_per_index_t	page_zip_stat_per_index;
+
+/**********************************************************************//**
+Write the "owned" flag of a record on a compressed page.  The n_owned field
+must already have been written on the uncompressed page. */
+void
+page_zip_rec_set_owned(
+/*===================*/
+	buf_block_t*	block,	/*!< in/out: ROW_FORMAT=COMPRESSED page */
+	const byte*	rec,	/*!< in: record on the uncompressed page */
+	ulint		flag,	/*!< in: the owned flag (nonzero=TRUE) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+#endif /* !UNIV_INNOCHECKSUM */
+#endif
diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h
new file mode 100644
index 00000000..43329906
--- /dev/null
+++ b/storage/innobase/include/page0zip.h
@@ -0,0 +1,383 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0zip.h
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#ifndef page0zip_h
+#define page0zip_h
+
+#include "buf0types.h"
+
+#ifndef UNIV_INNOCHECKSUM
+#include "mtr0types.h"
+#include "page0types.h"
+#include "dict0types.h"
+#include "srv0srv.h"
+#include "trx0types.h"
+#include "mem0mem.h"
+
+/* Compression level to be used by zlib. Settable by user. */
+extern uint	page_zip_level;
+
+/* Default compression level. */
+#define DEFAULT_COMPRESSION_LEVEL	6
+/** Start offset of the area that will be compressed */
+#define PAGE_ZIP_START			PAGE_NEW_SUPREMUM_END
+/** Size of a compressed page directory entry */
+#define PAGE_ZIP_DIR_SLOT_SIZE		2
+/** Predefine the sum of DIR_SLOT, TRX_ID & ROLL_PTR */
+#define PAGE_ZIP_CLUST_LEAF_SLOT_SIZE		\
+		(PAGE_ZIP_DIR_SLOT_SIZE		\
+		+ DATA_TRX_ID_LEN		\
+		+ DATA_ROLL_PTR_LEN)
+/** Mask of record offsets */
+#define PAGE_ZIP_DIR_SLOT_MASK		0x3fffU
+/** 'owned' flag */
+#define PAGE_ZIP_DIR_SLOT_OWNED		0x4000U
+/** 'deleted' flag */
+#define PAGE_ZIP_DIR_SLOT_DEL		0x8000U
+
+/**********************************************************************//**
+Determine the size of a compressed page in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+	const page_zip_des_t*	page_zip)	/*!< in: compressed page */
+	MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************************//**
+Set the size of a compressed page in bytes. */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
+	ulint		size);		/*!< in: size in bytes */
+
+/** Determine if a record is so big that it needs to be stored externally.
+@param[in]	rec_size	length of the record in bytes
+@param[in]	comp		nonzero=compact format
+@param[in]	n_fields	number of fields in the record; ignored if
+tablespace is not compressed
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@return false if the entire record can be stored locally on the page */
+inline bool page_zip_rec_needs_ext(ulint rec_size, ulint comp, ulint n_fields,
+				   ulint zip_size)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/**********************************************************************//**
+Determine the guaranteed free space on an empty page.
+@return minimum payload size on the page */
+ulint
+page_zip_empty_size(
+/*================*/
+	ulint	n_fields,	/*!< in: number of columns in the index */
+	ulint	zip_size)	/*!< in: compressed page size in bytes */
+	MY_ATTRIBUTE((const));
+
+/** Check whether a tuple is too big for compressed table
+@param[in]	index	dict index object
+@param[in]	entry	entry for the index
+@return	true if it's too big, otherwise false */
+bool
+page_zip_is_too_big(
+	const dict_index_t*	index,
+	const dtuple_t*		entry);
+
+/**********************************************************************//**
+Initialize a compressed page descriptor. */
+#define page_zip_des_init(page_zip) (page_zip)->clear()
+
+/**********************************************************************//**
+Configure the zlib allocator to use the given memory heap. */
+void
+page_zip_set_alloc(
+/*===============*/
+	void*		stream,		/*!< in/out: zlib stream */
+	mem_heap_t*	heap);		/*!< in: memory heap to use */
+
+/** Attempt to compress a ROW_FORMAT=COMPRESSED page.
+@retval true on success
+@retval false on failure; block->page.zip will be left intact. */
+bool
+page_zip_compress(
+	buf_block_t*		block,	/*!< in/out: buffer block */
+	dict_index_t*		index,	/*!< in: index of the B-tree node */
+	ulint			level,	/*!< in: compression level */
+	mtr_t*			mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Write the index information for the compressed page.
+@return used size of buf */
+ulint
+page_zip_fields_encode(
+/*===================*/
+	ulint			n,	/*!< in: number of fields
+					to compress */
+	const dict_index_t*	index,	/*!< in: index comprising
+					at least n fields */
+	ulint			trx_id_pos,
+					/*!< in: position of the trx_id column
+					in the index, or ULINT_UNDEFINED if
+					this is a non-leaf page */
+	byte*			buf);	/*!< out: buffer of (n + 1) * 2 bytes */
+
+/**********************************************************************//**
+Decompress a page.  This function should tolerate errors on the compressed
+page.  Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return TRUE on success, FALSE on failure */
+ibool
+page_zip_decompress(
+/*================*/
+	page_zip_des_t*	page_zip,/*!< in: data, ssize;
+				out: m_start, m_end, m_nonempty, n_blobs */
+	page_t*		page,	/*!< out: uncompressed page, may be trashed */
+	ibool		all)	/*!< in: TRUE=decompress the whole page;
+				FALSE=verify but do not copy some
+				page header fields that should not change
+				after page creation */
+	MY_ATTRIBUTE((nonnull(1,2)));
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+@return TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+	const page_zip_des_t*	page_zip);	/*!< in: compressed page
+						descriptor */
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_ZIP_DEBUG
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+ibool
+page_zip_validate_low(
+/*==================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	const page_t*		page,	/*!< in: uncompressed page */
+	const dict_index_t*	index,	/*!< in: index of the page, if known */
+	ibool			sloppy)	/*!< in: FALSE=strict,
+					TRUE=ignore the MIN_REC_FLAG */
+	MY_ATTRIBUTE((nonnull(1,2)));
+/**********************************************************************//**
+Check that the compressed and decompressed pages match. */
+ibool
+page_zip_validate(
+/*==============*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	const page_t*		page,	/*!< in: uncompressed page */
+	const dict_index_t*	index)	/*!< in: index of the page, if known */
+	MY_ATTRIBUTE((nonnull(1,2)));
+#endif /* UNIV_ZIP_DEBUG */
+
+/**********************************************************************//**
+Determine how big record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust)/*!< in: TRUE if clustered index */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log.
+@return TRUE if page_zip_write_rec() will succeed */
+UNIV_INLINE
+ibool
+page_zip_available(
+/*===============*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust,/*!< in: TRUE if clustered index */
+	ulint			length,	/*!< in: combined size of the record */
+	ulint			create)	/*!< in: nonzero=add the record to
+					the heap */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Write an entire record to the ROW_FORMAT=COMPRESSED page.
+The data must already have been written to the uncompressed page.
+@param[in,out]	block		ROW_FORMAT=COMPRESSED page
+@param[in]	rec		record in the uncompressed page
+@param[in]	index		the index that the page belongs to
+@param[in]	offsets		rec_get_offsets(rec, index)
+@param[in]	create		nonzero=insert, zero=update
+@param[in,out]	mtr		mini-transaction */
+void page_zip_write_rec(buf_block_t *block, const byte *rec,
+                        const dict_index_t *index, const rec_offs *offsets,
+                        ulint create, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Write a BLOB pointer of a record on the leaf page of a clustered index.
+The information must already have been updated on the uncompressed page. */
+void
+page_zip_write_blob_ptr(
+/*====================*/
+	buf_block_t*	block,	/*!< in/out: ROW_FORMAT=COMPRESSED page */
+	const byte*	rec,	/*!< in/out: record whose data is being
+				written */
+	dict_index_t*	index,	/*!< in: index of the page */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	ulint		n,	/*!< in: column index */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Write the node pointer of a record on a non-leaf compressed page. */
+void
+page_zip_write_node_ptr(
+/*====================*/
+	buf_block_t*	block,	/*!< in/out: compressed page */
+	byte*		rec,	/*!< in/out: record */
+	ulint		size,	/*!< in: data size of rec */
+	ulint		ptr,	/*!< in: node pointer */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+
+/** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record.
+@param[in,out]	block		ROW_FORMAT=COMPRESSED page
+@param[in,out]	rec		record
+@param[in]	offsets		rec_get_offsets(rec, index)
+@param[in]	trx_id_col	field number of DB_TRX_ID (number of PK fields)
+@param[in]	trx_id		DB_TRX_ID value (transaction identifier)
+@param[in]	roll_ptr	DB_ROLL_PTR value (undo log pointer)
+@param[in,out]	mtr		mini-transaction */
+void
+page_zip_write_trx_id_and_roll_ptr(
+	buf_block_t*	block,
+	byte*		rec,
+	const rec_offs*	offsets,
+	ulint		trx_id_col,
+	trx_id_t	trx_id,
+	roll_ptr_t	roll_ptr,
+	mtr_t*		mtr)
+	MY_ATTRIBUTE((nonnull));
+
+/** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record.
+@param[in,out]  block   buffer block
+@param[in,out]  rec     record on a physical index page
+@param[in]      flag    the value of the delete-mark flag
+@param[in,out]  mtr     mini-transaction  */
+void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag,
+                              mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Insert a record to the dense page directory. */
+void
+page_zip_dir_insert(
+/*================*/
+	page_cur_t*	cursor,	/*!< in/out: page cursor */
+	uint16_t	free_rec,/*!< in: record from which rec was
+				allocated, or 0 */
+	byte*		rec,	/*!< in: record to insert */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull(1,3,4)));
+
+/** Shift the dense page directory and the array of BLOB pointers
+when a record is deleted.
+@param[in,out]  block   index page
+@param[in,out]  rec     record being deleted
+@param[in]      index   the index that the page belongs to
+@param[in]      offsets rec_get_offsets(rec, index)
+@param[in]	free	previous start of the free list
+@param[in,out]  mtr     mini-transaction */
+void page_zip_dir_delete(buf_block_t *block, byte *rec,
+                         const dict_index_t *index, const rec_offs *offsets,
+                         const byte *free, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull(1,2,3,4,6)));
+
+/**********************************************************************//**
+Reorganize and compress a page.  This is a low-level operation for
+compressed pages, to be used when page_zip_compress() fails.
+On success, redo log will be written.
+The function btr_page_reorganize() should be preferred whenever possible.
+IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
+non-clustered index, the caller must update the insert buffer free
+bits in the same mini-transaction in such a way that the modification
+will be redo-logged.
+@return error code
+@retval DB_FAIL on overflow; the block_zip will be left intact */
+dberr_t
+page_zip_reorganize(
+	buf_block_t*	block,	/*!< in/out: page with compressed page;
+				on the compressed page, in: size;
+				out: data, n_blobs,
+				m_start, m_end, m_nonempty */
+	dict_index_t*	index,	/*!< in: index of the B-tree node */
+	ulint		z_level,/*!< in: compression level */
+	mtr_t*		mtr,	/*!< in: mini-transaction */
+	bool		restore = false)/*!< whether to restore on failure */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**********************************************************************//**
+Copy the records of a page byte for byte.  Do not copy the page header
+or trailer, except those B-tree header fields that are directly
+related to the storage of records.  Also copy PAGE_MAX_TRX_ID.
+NOTE: The caller must update the lock table and the adaptive hash index. */
+void
+page_zip_copy_recs(
+	buf_block_t*		block,		/*!< in/out: buffer block */
+	const page_zip_des_t*	src_zip,	/*!< in: compressed page */
+	const page_t*		src,		/*!< in: page */
+	dict_index_t*		index,		/*!< in: index of the B-tree */
+	mtr_t*			mtr);		/*!< in: mini-transaction */
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Calculate the compressed page checksum.
+@param data		compressed page
+@param size		size of compressed page
+@param use_adler	whether to use Adler32 instead of a XOR of 3 CRC-32C
+@return page checksum */
+uint32_t page_zip_calc_checksum(const void *data, size_t size, bool use_adler);
+
+/** Validate the checksum on a ROW_FORMAT=COMPRESSED page.
+@param data    ROW_FORMAT=COMPRESSED page
+@param size    size of the page, in bytes
+@return whether the stored checksum matches innodb_checksum_algorithm */
+bool page_zip_verify_checksum(const byte *data, size_t size);
+
+#ifndef UNIV_INNOCHECKSUM
+/**********************************************************************//**
+Reset the counters used for filling
+INFORMATION_SCHEMA.innodb_cmp_per_index. */
+UNIV_INLINE
+void
+page_zip_reset_stat_per_index();
+/*===========================*/
+
+#include "page0zip.inl"
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif /* page0zip_h */
diff --git a/storage/innobase/include/page0zip.inl b/storage/innobase/include/page0zip.inl
new file mode 100644
index 00000000..afc877c3
--- /dev/null
+++ b/storage/innobase/include/page0zip.inl
@@ -0,0 +1,317 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0zip.inl
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#include "page0page.h"
+
+/* The format of compressed pages is as follows.
+
+The header and trailer of the uncompressed pages, excluding the page
+directory in the trailer, are copied as is to the header and trailer
+of the compressed page.
+
+At the end of the compressed page, there is a dense page directory
+pointing to every user record contained on the page, including deleted
+records on the free list.  The dense directory is indexed in the
+collation order, i.e., in the order in which the record list is
+linked on the uncompressed page.  The infimum and supremum records are
+excluded.  The two most significant bits of the entries are allocated
+for the delete-mark and an n_owned flag indicating the last record in
+a chain of records pointed to from the sparse page directory on the
+uncompressed page.
+
+The data between PAGE_ZIP_START and the last page directory entry will
+be written in compressed format, starting at offset PAGE_DATA.
+Infimum and supremum records are not stored.  We exclude the
+REC_N_NEW_EXTRA_BYTES in every record header.  These can be recovered
+from the dense page directory stored at the end of the compressed
+page.
+
+The fields node_ptr (in non-leaf B-tree nodes; level>0), trx_id and
+roll_ptr (in leaf B-tree nodes; level=0), and BLOB pointers of
+externally stored columns are stored separately, in ascending order of
+heap_no and column index, starting backwards from the dense page
+directory.
+
+The compressed data stream may be followed by a modification log
+covering the compressed portion of the page, as follows.
+
+MODIFICATION LOG ENTRY FORMAT
+- write record:
+  - (heap_no - 1) << 1 (1..2 bytes)
+  - extra bytes backwards
+  - data bytes
+- clear record:
+  - (heap_no - 1) << 1 | 1 (1..2 bytes)
+
+The integer values are stored in a variable-length format:
+- 0xxxxxxx: 0..127
+- 1xxxxxxx xxxxxxxx: 0..32767
+
+The end of the modification log is marked by a 0 byte.
+
+In summary, the compressed page looks like this:
+
+(1) Uncompressed page header (PAGE_DATA bytes)
+(2) Compressed index information
+(3) Compressed page data
+(4) Page modification log (page_zip->m_start..page_zip->m_end)
+(5) Empty zero-filled space
+(6) BLOB pointers (on leaf pages)
+  - BTR_EXTERN_FIELD_REF_SIZE for each externally stored column
+  - in descending collation order
+(7) Uncompressed columns of user records, n_dense * uncompressed_size bytes,
+  - indexed by heap_no
+  - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN for leaf pages of clustered indexes
+  - REC_NODE_PTR_SIZE for non-leaf pages
+  - 0 otherwise
+(8) dense page directory, stored backwards
+  - n_dense = n_heap - 2
+  - existing records in ascending collation order
+  - deleted records (free list) in link order
+*/
+
+/**********************************************************************//**
+Determine the size of a compressed page in bytes.
+The size is derived from page_zip->ssize as
+(UNIV_ZIP_SIZE_MIN >> 1) << ssize; an ssize of 0 yields 0.
+@return size in bytes */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+	const page_zip_des_t*	page_zip)	/*!< in: compressed page */
+{
+	ulint	size = 0;
+
+	if (page_zip->ssize) {
+		size = (UNIV_ZIP_SIZE_MIN >> 1) << page_zip->ssize;
+
+		/* A nonzero ssize must decode to a size within the
+		supported range. */
+		ut_ad(size >= UNIV_ZIP_SIZE_MIN);
+		ut_ad(size <= srv_page_size);
+	}
+
+	return(size);
+}
+/**********************************************************************//**
+Set the size of a compressed page in bytes.
+The size is stored in page_zip->ssize as a shift count; a size of 0 is
+stored as ssize 0. */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
+	ulint		size)		/*!< in: size in bytes */
+{
+	unsigned	shift = 0;
+
+	if (size) {
+		ut_ad(ut_is_2pow(size));
+
+		/* Find the smallest shift >= 1 for which
+		(512 << shift) is not less than size. */
+		shift = 1;
+		while (size > (512U << shift)) {
+			shift++;
+		}
+	}
+
+	/* Keep only as many bits as the ssize bit-field can hold. */
+	page_zip->ssize = shift & ((1U << PAGE_ZIP_SSIZE_BITS) - 1);
+
+	/* The stored value must decode back to the requested size. */
+	ut_ad(page_zip_get_size(page_zip) == size);
+}
+
+/** Determine if a record is so big that it needs to be stored externally.
+@param[in]	rec_size	length of the record in bytes
+@param[in]	comp		nonzero=compact format
+@param[in]	n_fields	number of fields in the record; ignored if
+tablespace is not compressed
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@return false if the entire record can be stored locally on the page */
+inline bool page_zip_rec_needs_ext(ulint rec_size, ulint comp, ulint n_fields,
+				   ulint zip_size)
+{
+	/* FIXME: row size check in this function seems to be the most
+	correct. Put it in a separate function and use in more places of
+	InnoDB */
+
+	ut_ad(rec_size
+	      > ulint(comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES));
+	ut_ad(comp || !zip_size);
+
+#if UNIV_PAGE_SIZE_MAX > COMPRESSED_REC_MAX_DATA_SIZE
+	/* Absolute upper bound for the record format in use. */
+	const ulint	rec_size_limit = comp
+		? COMPRESSED_REC_MAX_DATA_SIZE
+		: REDUNDANT_REC_MAX_DATA_SIZE;
+
+	if (rec_size >= rec_size_limit) {
+		return(true);
+	}
+#endif
+
+	if (!zip_size) {
+		/* Not a compressed page: the record must be smaller than
+		half of the free space of an empty page. */
+		return(rec_size >= page_get_free_space_of_empty(comp) / 2);
+	}
+
+	ut_ad(comp);
+
+	/* On a compressed page, there is a two-byte entry in
+	the dense page directory for every record.  But there
+	is no record header.  There should be enough room for
+	one record on an empty leaf page.  Subtract 1 byte for
+	the encoded heap number. */
+	if (rec_size - (REC_N_NEW_EXTRA_BYTES - 2 - 1)
+	    >= page_zip_empty_size(n_fields, zip_size)) {
+		return(true);
+	}
+
+	/* Check also the available space on the uncompressed page. */
+	return(rec_size >= page_get_free_space_of_empty(TRUE) / 2);
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+Every check is a debug assertion (ut_ad); if none of them fires, the
+function returns TRUE unconditionally.
+@return TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+	const page_zip_des_t*	page_zip)/*!< in: compressed page descriptor */
+{
+	/* The descriptor and its data buffer must exist. */
+	ut_ad(page_zip);
+	ut_ad(page_zip->data);
+	/* The shift size must not exceed the maximum. */
+	ut_ad(page_zip->ssize <= PAGE_ZIP_SSIZE_MAX);
+	/* The page must be larger than PAGE_DATA plus one dense
+	directory slot. */
+	ut_ad(page_zip_get_size(page_zip)
+	      > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE);
+	/* The modification log bounds must be ordered and must end
+	before the end of the page. */
+	ut_ad(page_zip->m_start <= page_zip->m_end);
+	ut_ad(page_zip->m_end < page_zip_get_size(page_zip));
+	/* The BLOB pointers alone must take less space than the page. */
+	ut_ad(page_zip->n_blobs
+	      < page_zip_get_size(page_zip) / BTR_EXTERN_FIELD_REF_SIZE);
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Determine the length of the page trailer.
+The trailer consists of a fixed number of bytes for each of the
+n_heap - 2 records in the heap (n_dense in the format description
+above), plus BTR_EXTERN_FIELD_REF_SIZE bytes per BLOB pointer.
+@return length of the page trailer, in bytes, not including the
+terminating zero byte of the modification log */
+UNIV_INLINE
+ulint
+page_zip_get_trailer_len(
+/*=====================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust)/*!< in: TRUE if clustered index */
+{
+	/* Trailer bytes used per record: the dense page directory slot
+	plus the fields that are stored uncompressed for this page kind. */
+	ulint	uncompressed_size;
+
+	ut_ad(page_zip_simple_validate(page_zip));
+	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+	if (!page_is_leaf(page_zip->data)) {
+		/* Non-leaf page: a node pointer per record; no BLOBs. */
+		uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
+			+ REC_NODE_PTR_SIZE;
+		ut_ad(!page_zip->n_blobs);
+	} else if (is_clust) {
+		/* Clustered index leaf page: trx_id and roll_ptr per
+		record; BLOB pointers may be present. */
+		uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
+			+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+	} else {
+		/* Other leaf page: only the directory slot; no BLOBs. */
+		uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE;
+		ut_ad(!page_zip->n_blobs);
+	}
+
+	return (ulint(page_dir_get_n_heap(page_zip->data)) - 2)
+		* uncompressed_size
+		+ ulint(page_zip->n_blobs) * BTR_EXTERN_FIELD_REF_SIZE;
+}
+
+/**********************************************************************//**
+Determine how big record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust)/*!< in: TRUE if clustered index */
+{
+	/* Start from the space already taken by the page trailer. */
+	ulint	reserved = page_zip_get_trailer_len(page_zip, is_clust);
+
+	/* When a record is created, a pointer may be added to
+	the dense directory.
+	Likewise, space for the columns that will not be
+	compressed will be allocated from the page trailer.
+	Also the BLOB pointers will be allocated from there, but
+	we may as well count them in the length of the record. */
+	reserved += PAGE_ZIP_DIR_SLOT_SIZE;
+
+	/* Everything up to the end of the modification log is in use. */
+	reserved += page_zip->m_end;
+	reserved += REC_N_NEW_EXTRA_BYTES - 2;
+
+	/* The subtraction is done in unsigned arithmetic and then
+	reinterpreted as signed, so the result may be zero or negative. */
+	return(lint(page_zip_get_size(page_zip) - reserved));
+}
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log.
+@return TRUE if enough space is available */
+UNIV_INLINE
+ibool
+page_zip_available(
+/*===============*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust,/*!< in: TRUE if clustered index */
+	ulint			length,	/*!< in: combined size of the record */
+	ulint			create)	/*!< in: nonzero=add the record to
+					the heap */
+{
+	ut_ad(length > REC_N_NEW_EXTRA_BYTES);
+
+	/* Total number of bytes that must fit on the compressed page,
+	starting with the existing page trailer. */
+	ulint	needed = page_zip_get_trailer_len(page_zip, is_clust);
+
+	/* Subtract the fixed extra bytes and add the maximum
+	space needed for identifying the record (encoded heap_no). */
+	needed += length - (REC_N_NEW_EXTRA_BYTES - 2);
+
+	if (create) {
+		/* When a record is created, a pointer may be added to
+		the dense directory.
+		Likewise, space for the columns that will not be
+		compressed will be allocated from the page trailer.
+		Also the BLOB pointers will be allocated from there, but
+		we may as well count them in the length of the record. */
+		needed += PAGE_ZIP_DIR_SLOT_SIZE;
+	}
+
+	/* Everything up to the end of the modification log is in use. */
+	needed += page_zip->m_end;
+
+	return(needed < page_zip_get_size(page_zip));
+}
+
+/**********************************************************************//**
+Reset the counters used for filling
+INFORMATION_SCHEMA.innodb_cmp_per_index.
+All entries of page_zip_stat_per_index are removed while
+page_zip_stat_per_index_mutex is held. */
+UNIV_INLINE
+void
+page_zip_reset_stat_per_index()
+/*===========================*/
+{
+	/* Hold the mutex for the duration of the clear(). */
+	mysql_mutex_lock(&page_zip_stat_per_index_mutex);
+	page_zip_stat_per_index.clear();
+	mysql_mutex_unlock(&page_zip_stat_per_index_mutex);
+}
diff --git a/storage/innobase/include/pars0grm.h b/storage/innobase/include/pars0grm.h
new file mode 100644
index 00000000..e7112d99
--- /dev/null
+++ b/storage/innobase/include/pars0grm.h
@@ -0,0 +1,151 @@
+/* A Bison parser, made by GNU Bison 3.7.6.  */
+
+/* Bison interface for Yacc-like parsers in C
+
+   Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2021 Free Software Foundation,
+   Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* As a special exception, you may create a larger work that contains
+   part or all of the Bison parser skeleton and distribute that work
+   under terms of your choice, so long as that work isn't itself a
+   parser generator using the skeleton or a modified version thereof
+   as a parser skeleton.  Alternatively, if you modify or redistribute
+   the parser skeleton itself, you may (at your option) remove this
+   special exception, which will cause the skeleton and the resulting
+   Bison output files to be licensed under the GNU General Public
+   License without this special exception.
+
+   This special exception was added by the Free Software Foundation in
+   version 2.2 of Bison.  */
+
+/* DO NOT RELY ON FEATURES THAT ARE NOT DOCUMENTED in the manual,
+   especially those whose name start with YY_ or yy_.  They are
+   private implementation details that can be changed or removed.  */
+
+#ifndef YY_YY_PARS0GRM_TAB_H_INCLUDED
+# define YY_YY_PARS0GRM_TAB_H_INCLUDED
+/* Debug traces.  */
+#ifndef YYDEBUG
+# define YYDEBUG 0
+#endif
+#if YYDEBUG
+extern int yydebug;
+#endif
+
+/* Token kinds.  */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+  enum yytokentype
+  {
+    YYEMPTY = -2,
+    YYEOF = 0,                     /* "end of file"  */
+    YYerror = 256,                 /* error  */
+    YYUNDEF = 257,                 /* "invalid token"  */
+    PARS_INT_LIT = 258,            /* PARS_INT_LIT  */
+    PARS_FLOAT_LIT = 259,          /* PARS_FLOAT_LIT  */
+    PARS_STR_LIT = 260,            /* PARS_STR_LIT  */
+    PARS_NULL_LIT = 261,           /* PARS_NULL_LIT  */
+    PARS_ID_TOKEN = 262,           /* PARS_ID_TOKEN  */
+    PARS_AND_TOKEN = 263,          /* PARS_AND_TOKEN  */
+    PARS_OR_TOKEN = 264,           /* PARS_OR_TOKEN  */
+    PARS_NOT_TOKEN = 265,          /* PARS_NOT_TOKEN  */
+    PARS_GE_TOKEN = 266,           /* PARS_GE_TOKEN  */
+    PARS_LE_TOKEN = 267,           /* PARS_LE_TOKEN  */
+    PARS_NE_TOKEN = 268,           /* PARS_NE_TOKEN  */
+    PARS_PROCEDURE_TOKEN = 269,    /* PARS_PROCEDURE_TOKEN  */
+    PARS_IN_TOKEN = 270,           /* PARS_IN_TOKEN  */
+    PARS_INT_TOKEN = 271,          /* PARS_INT_TOKEN  */
+    PARS_CHAR_TOKEN = 272,         /* PARS_CHAR_TOKEN  */
+    PARS_IS_TOKEN = 273,           /* PARS_IS_TOKEN  */
+    PARS_BEGIN_TOKEN = 274,        /* PARS_BEGIN_TOKEN  */
+    PARS_END_TOKEN = 275,          /* PARS_END_TOKEN  */
+    PARS_IF_TOKEN = 276,           /* PARS_IF_TOKEN  */
+    PARS_THEN_TOKEN = 277,         /* PARS_THEN_TOKEN  */
+    PARS_ELSE_TOKEN = 278,         /* PARS_ELSE_TOKEN  */
+    PARS_ELSIF_TOKEN = 279,        /* PARS_ELSIF_TOKEN  */
+    PARS_LOOP_TOKEN = 280,         /* PARS_LOOP_TOKEN  */
+    PARS_WHILE_TOKEN = 281,        /* PARS_WHILE_TOKEN  */
+    PARS_RETURN_TOKEN = 282,       /* PARS_RETURN_TOKEN  */
+    PARS_SELECT_TOKEN = 283,       /* PARS_SELECT_TOKEN  */
+    PARS_COUNT_TOKEN = 284,        /* PARS_COUNT_TOKEN  */
+    PARS_FROM_TOKEN = 285,         /* PARS_FROM_TOKEN  */
+    PARS_WHERE_TOKEN = 286,        /* PARS_WHERE_TOKEN  */
+    PARS_FOR_TOKEN = 287,          /* PARS_FOR_TOKEN  */
+    PARS_DDOT_TOKEN = 288,         /* PARS_DDOT_TOKEN  */
+    PARS_ORDER_TOKEN = 289,        /* PARS_ORDER_TOKEN  */
+    PARS_BY_TOKEN = 290,           /* PARS_BY_TOKEN  */
+    PARS_ASC_TOKEN = 291,          /* PARS_ASC_TOKEN  */
+    PARS_DESC_TOKEN = 292,         /* PARS_DESC_TOKEN  */
+    PARS_INSERT_TOKEN = 293,       /* PARS_INSERT_TOKEN  */
+    PARS_INTO_TOKEN = 294,         /* PARS_INTO_TOKEN  */
+    PARS_VALUES_TOKEN = 295,       /* PARS_VALUES_TOKEN  */
+    PARS_UPDATE_TOKEN = 296,       /* PARS_UPDATE_TOKEN  */
+    PARS_SET_TOKEN = 297,          /* PARS_SET_TOKEN  */
+    PARS_DELETE_TOKEN = 298,       /* PARS_DELETE_TOKEN  */
+    PARS_CURRENT_TOKEN = 299,      /* PARS_CURRENT_TOKEN  */
+    PARS_OF_TOKEN = 300,           /* PARS_OF_TOKEN  */
+    PARS_CREATE_TOKEN = 301,       /* PARS_CREATE_TOKEN  */
+    PARS_TABLE_TOKEN = 302,        /* PARS_TABLE_TOKEN  */
+    PARS_INDEX_TOKEN = 303,        /* PARS_INDEX_TOKEN  */
+    PARS_UNIQUE_TOKEN = 304,       /* PARS_UNIQUE_TOKEN  */
+    PARS_CLUSTERED_TOKEN = 305,    /* PARS_CLUSTERED_TOKEN  */
+    PARS_ON_TOKEN = 306,           /* PARS_ON_TOKEN  */
+    PARS_ASSIGN_TOKEN = 307,       /* PARS_ASSIGN_TOKEN  */
+    PARS_DECLARE_TOKEN = 308,      /* PARS_DECLARE_TOKEN  */
+    PARS_CURSOR_TOKEN = 309,       /* PARS_CURSOR_TOKEN  */
+    PARS_SQL_TOKEN = 310,          /* PARS_SQL_TOKEN  */
+    PARS_OPEN_TOKEN = 311,         /* PARS_OPEN_TOKEN  */
+    PARS_FETCH_TOKEN = 312,        /* PARS_FETCH_TOKEN  */
+    PARS_CLOSE_TOKEN = 313,        /* PARS_CLOSE_TOKEN  */
+    PARS_NOTFOUND_TOKEN = 314,     /* PARS_NOTFOUND_TOKEN  */
+    PARS_TO_BINARY_TOKEN = 315,    /* PARS_TO_BINARY_TOKEN  */
+    PARS_SUBSTR_TOKEN = 316,       /* PARS_SUBSTR_TOKEN  */
+    PARS_CONCAT_TOKEN = 317,       /* PARS_CONCAT_TOKEN  */
+    PARS_INSTR_TOKEN = 318,        /* PARS_INSTR_TOKEN  */
+    PARS_LENGTH_TOKEN = 319,       /* PARS_LENGTH_TOKEN  */
+    PARS_COMMIT_TOKEN = 320,       /* PARS_COMMIT_TOKEN  */
+    PARS_ROLLBACK_TOKEN = 321,     /* PARS_ROLLBACK_TOKEN  */
+    PARS_WORK_TOKEN = 322,         /* PARS_WORK_TOKEN  */
+    PARS_EXIT_TOKEN = 323,         /* PARS_EXIT_TOKEN  */
+    PARS_FUNCTION_TOKEN = 324,     /* PARS_FUNCTION_TOKEN  */
+    PARS_LOCK_TOKEN = 325,         /* PARS_LOCK_TOKEN  */
+    PARS_SHARE_TOKEN = 326,        /* PARS_SHARE_TOKEN  */
+    PARS_MODE_TOKEN = 327,         /* PARS_MODE_TOKEN  */
+    PARS_LIKE_TOKEN = 328,         /* PARS_LIKE_TOKEN  */
+    PARS_LIKE_TOKEN_EXACT = 329,   /* PARS_LIKE_TOKEN_EXACT  */
+    PARS_LIKE_TOKEN_PREFIX = 330,  /* PARS_LIKE_TOKEN_PREFIX  */
+    PARS_LIKE_TOKEN_SUFFIX = 331,  /* PARS_LIKE_TOKEN_SUFFIX  */
+    PARS_LIKE_TOKEN_SUBSTR = 332,  /* PARS_LIKE_TOKEN_SUBSTR  */
+    PARS_TABLE_NAME_TOKEN = 333,   /* PARS_TABLE_NAME_TOKEN  */
+    PARS_BIGINT_TOKEN = 334,       /* PARS_BIGINT_TOKEN  */
+    NEG = 335                      /* NEG  */
+  };
+  typedef enum yytokentype yytoken_kind_t;
+#endif
+
+/* Value type.  */
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef int YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+extern YYSTYPE yylval;
+
+int yyparse (void);
+
+#endif /* !YY_YY_PARS0GRM_TAB_H_INCLUDED  */
diff --git a/storage/innobase/include/pars0opt.h b/storage/innobase/include/pars0opt.h
new file mode 100644
index 00000000..07a726ea
--- /dev/null
+++ b/storage/innobase/include/pars0opt.h
@@ -0,0 +1,68 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2018, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0opt.h
+Simple SQL optimizer
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0opt_h
+#define pars0opt_h
+
+#include "que0types.h"
+#include "pars0sym.h"
+#include "row0sel.h"
+
+/*******************************************************************//**
+Optimizes a select. Decides which index to use for each table. The tables
+are accessed in the order in which they were written in the FROM part of the
+select statement. */
+void
+opt_search_plan(
+/*============*/
+	sel_node_t*	sel_node);	/*!< in: parsed select node */
+/*******************************************************************//**
+Looks for occurrences of the columns of the table in the query subgraph and
+adds them to the list of columns if an occurrence of the same column does not
+already exist in the list. If the column is already in the list, puts a value
+indirection to point to the occurrence in the column list, except if the
+column occurrence we are looking at is in the column list, in which case
+nothing is done. */
+void
+opt_find_all_cols(
+/*==============*/
+	ibool		copy_val,	/*!< in: if TRUE, new found columns are
+					added as columns to copy */
+	dict_index_t*	index,		/*!< in: index to use */
+	sym_node_list_t* col_list,	/*!< in: base node of a list where
+					to add new found columns */
+	plan_t*		plan,		/*!< in: plan or NULL */
+	que_node_t*	exp);		/*!< in: expression or condition */
+#ifdef UNIV_SQL_DEBUG
+/********************************************************************//**
+Prints info of a query plan. */
+void
+opt_print_query_plan(
+/*=================*/
+	sel_node_t*	sel_node);	/*!< in: select node */
+#endif /* UNIV_SQL_DEBUG */
+
+#endif
diff --git a/storage/innobase/include/pars0pars.h b/storage/innobase/include/pars0pars.h
new file mode 100644
index 00000000..16823ce1
--- /dev/null
+++ b/storage/innobase/include/pars0pars.h
@@ -0,0 +1,695 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0pars.h
+SQL parser
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0pars_h
+#define pars0pars_h
+
+#include "que0types.h"
+#include "pars0types.h"
+#include "row0types.h"
+#include "trx0types.h"
+#include "ut0vec.h"
+#include "row0mysql.h"
+
+/** Type of the user functions. The first argument is always InnoDB-supplied
+and varies in type, while 'user_arg' is a user-supplied argument. The
+meaning of the return type also varies. See the individual use cases, e.g.
+the FETCH statement, for details on them. */
+typedef ibool	(*pars_user_func_cb_t)(void* arg, void* user_arg);
+
+/** If the following is set TRUE, the parser will emit debugging
+information */
+extern int	yydebug;
+
+/* Global variable used while parsing a single procedure or query : the code is
+NOT re-entrant */
+extern sym_tab_t*	pars_sym_tab_global;
+
+extern pars_res_word_t	pars_to_binary_token;
+extern pars_res_word_t	pars_substr_token;
+extern pars_res_word_t	pars_concat_token;
+extern pars_res_word_t	pars_length_token;
+extern pars_res_word_t	pars_instr_token;
+extern pars_res_word_t	pars_count_token;
+extern pars_res_word_t	pars_int_token;
+extern pars_res_word_t	pars_bigint_token;
+extern pars_res_word_t	pars_char_token;
+extern pars_res_word_t	pars_update_token;
+extern pars_res_word_t	pars_asc_token;
+extern pars_res_word_t	pars_desc_token;
+extern pars_res_word_t	pars_open_token;
+extern pars_res_word_t	pars_close_token;
+extern pars_res_word_t	pars_share_token;
+extern pars_res_word_t	pars_unique_token;
+extern pars_res_word_t	pars_clustered_token;
+
+extern ulint		pars_star_denoter;
+
+/* Procedure parameter types */
+#define PARS_INPUT	0
+#define PARS_OUTPUT	1
+#define PARS_NOT_PARAM	2
+
+int
+yyparse(void);
+
+/*************************************************************//**
+Parses an SQL string returning the query graph.
+@return own: the query graph */
+que_t*
+pars_sql(
+/*=====*/
+	pars_info_t*	info,	/*!< in: extra information, or NULL */
+	const char*	str);	/*!< in: SQL string */
+/*************************************************************//**
+Retrieves characters to the lexical analyzer.
+@return number of characters copied or 0 on EOF */
+int
+pars_get_lex_chars(
+/*===============*/
+	char*	buf,		/*!< in/out: buffer where to copy */
+	size_t	max_size);	/*!< in: maximum number of characters which fit
+				in the buffer */
+/*************************************************************//**
+Called by yyparse on error. */
+void
+yyerror(
+/*====*/
+	const char*	s);	/*!< in: error message string */
+/*********************************************************************//**
+Parses a variable declaration.
+@return own: symbol table node of type SYM_VAR */
+sym_node_t*
+pars_variable_declaration(
+/*======================*/
+	sym_node_t*	node,	/*!< in: symbol table node allocated for the
+				id of the variable */
+	pars_res_word_t* type);	/*!< in: pointer to a type token */
+/*********************************************************************//**
+Parses a function expression.
+@return own: function node in a query tree */
+func_node_t*
+pars_func(
+/*======*/
+	que_node_t*	res_word,/*!< in: function name reserved word */
+	que_node_t*	arg);	/*!< in: first argument in the argument list */
+/*************************************************************************
+Rebind a LIKE search string. NOTE: We ignore any '%' characters embedded
+within the search string.
+@return int code, presumably one of PARS_LIKE_TOKEN_* (TODO confirm) */
+int
+pars_like_rebind(
+/*=============*/
+        sym_node_t*     node,   /* in: The search string node.*/
+        const byte*     ptr,    /* in: literal to (re) bind */
+        ulint           len);   /* in: length of literal to (re) bind*/
+/*********************************************************************//**
+Parses an operator expression.
+@return own: function node in a query tree */
+func_node_t*
+pars_op(
+/*====*/
+	int		func,	/*!< in: operator token code */
+	que_node_t*	arg1,	/*!< in: first argument */
+	que_node_t*	arg2);	/*!< in: second argument or NULL for an unary
+				operator */
+/*********************************************************************//**
+Parses an ORDER BY clause. Order by a single column only is supported.
+@return own: order-by node in a query tree */
+order_node_t*
+pars_order_by(
+/*==========*/
+	sym_node_t*	column,	/*!< in: column name */
+	pars_res_word_t* asc);	/*!< in: &pars_asc_token or &pars_desc_token */
+/*********************************************************************//**
+Parses a select list; creates a query graph node for the whole SELECT
+statement.
+@return own: select node in a query tree */
+sel_node_t*
+pars_select_list(
+/*=============*/
+	que_node_t*	select_list,	/*!< in: select list */
+	sym_node_t*	into_list);	/*!< in: variables list or NULL */
+/*********************************************************************//**
+Parses a cursor declaration.
+@return sym_node */
+que_node_t*
+pars_cursor_declaration(
+/*====================*/
+	sym_node_t*	sym_node,	/*!< in: cursor id node in the symbol
+					table */
+	sel_node_t*	select_node);	/*!< in: select node */
+/*********************************************************************//**
+Parses a function declaration.
+@return sym_node */
+que_node_t*
+pars_function_declaration(
+/*======================*/
+	sym_node_t*	sym_node);	/*!< in: function id node in the symbol
+					table */
+/*********************************************************************//**
+Parses a select statement.
+@return own: select node in a query tree */
+sel_node_t*
+pars_select_statement(
+/*==================*/
+	sel_node_t*	select_node,	/*!< in: select node already containing
+					the select list */
+	sym_node_t*	table_list,	/*!< in: table list */
+	que_node_t*	search_cond,	/*!< in: search condition or NULL */
+	pars_res_word_t* for_update,	/*!< in: NULL or &pars_update_token */
+	pars_res_word_t* consistent_read,/*!< in: NULL or
+						&pars_consistent_token */
+	order_node_t*	order_by);	/*!< in: NULL or an order-by node */
+/*********************************************************************//**
+Parses a column assignment in an update.
+@return column assignment node */
+col_assign_node_t*
+pars_column_assignment(
+/*===================*/
+	sym_node_t*	column,	/*!< in: column to assign */
+	que_node_t*	exp);	/*!< in: value to assign */
+/*********************************************************************//**
+Parses a delete or update statement start.
+@return own: update node in a query tree */
+upd_node_t*
+pars_update_statement_start(
+/*========================*/
+	ibool		is_delete,	/*!< in: TRUE if delete */
+	sym_node_t*	table_sym,	/*!< in: table name node */
+	col_assign_node_t* col_assign_list);/*!< in: column assignment list, NULL
+					if delete */
+/*********************************************************************//**
+Parses an update or delete statement.
+@return own: update node in a query tree */
+upd_node_t*
+pars_update_statement(
+/*==================*/
+	upd_node_t*	node,		/*!< in: update node */
+	sym_node_t*	cursor_sym,	/*!< in: pointer to a cursor entry in
+					the symbol table or NULL */
+	que_node_t*	search_cond);	/*!< in: search condition or NULL */
+/*********************************************************************//**
+Parses an insert statement.
+@return own: insert node in a query tree */
+ins_node_t*
+pars_insert_statement(
+/*==================*/
+	sym_node_t*	table_sym,	/*!< in: table name node */
+	que_node_t*	values_list,	/*!< in: value expression list or NULL */
+	sel_node_t*	select);	/*!< in: select condition or NULL */
+/*********************************************************************//**
+Parses an elsif element.
+@return elsif node */
+elsif_node_t*
+pars_elsif_element(
+/*===============*/
+	que_node_t*	cond,		/*!< in: if-condition */
+	que_node_t*	stat_list);	/*!< in: statement list */
+/*********************************************************************//**
+Parses an if-statement.
+@return if-statement node */
+if_node_t*
+pars_if_statement(
+/*==============*/
+	que_node_t*	cond,		/*!< in: if-condition */
+	que_node_t*	stat_list,	/*!< in: statement list */
+	que_node_t*	else_part);	/*!< in: else-part statement list */
+/*********************************************************************//**
+Parses a for-loop-statement.
+@return for-statement node */
+for_node_t*
+pars_for_statement(
+/*===============*/
+	sym_node_t*	loop_var,	/*!< in: loop variable */
+	que_node_t*	loop_start_limit,/*!< in: loop start expression */
+	que_node_t*	loop_end_limit,	/*!< in: loop end expression */
+	que_node_t*	stat_list);	/*!< in: statement list */
+/*********************************************************************//**
+Parses a while-statement.
+@return while-statement node */
+while_node_t*
+pars_while_statement(
+/*=================*/
+	que_node_t*	cond,		/*!< in: while-condition */
+	que_node_t*	stat_list);	/*!< in: statement list */
+/*********************************************************************//**
+Parses an exit statement.
+@return exit statement node */
+exit_node_t*
+pars_exit_statement(void);
+/*=====================*/
+/*********************************************************************//**
+Parses a return-statement.
+@return return-statement node */
+return_node_t*
+pars_return_statement(void);
+/*=======================*/
+/*********************************************************************//**
+Parses a procedure call.
+@return function node */
+func_node_t*
+pars_procedure_call(
+/*================*/
+	que_node_t*	res_word,/*!< in: procedure name reserved word */
+	que_node_t*	args);	/*!< in: argument list */
+/*********************************************************************//**
+Parses an assignment statement.
+@return assignment statement node */
+assign_node_t*
+pars_assignment_statement(
+/*======================*/
+	sym_node_t*	var,	/*!< in: variable to assign */
+	que_node_t*	val);	/*!< in: value to assign */
+/*********************************************************************//**
+Parses a fetch statement. into_list or user_func (but not both) must be
+non-NULL.
+@return fetch statement node */
+fetch_node_t*
+pars_fetch_statement(
+/*=================*/
+	sym_node_t*	cursor,		/*!< in: cursor node */
+	sym_node_t*	into_list,	/*!< in: variables to set, or NULL */
+	sym_node_t*	user_func);	/*!< in: user function name, or NULL */
+/*********************************************************************//**
+Parses an open or close cursor statement.
+@return open or close statement node */
+open_node_t*
+pars_open_statement(
+/*================*/
+	ulint		type,	/*!< in: ROW_SEL_OPEN_CURSOR
+				or ROW_SEL_CLOSE_CURSOR */
+	sym_node_t*	cursor);	/*!< in: cursor node */
+/*********************************************************************//**
+Parses a row_printf-statement.
+@return row_printf-statement node */
+row_printf_node_t*
+pars_row_printf_statement(
+/*======================*/
+	sel_node_t*	sel_node);	/*!< in: select node */
+/*********************************************************************//**
+Parses a commit statement.
+@return own: commit node struct */
+commit_node_t*
+pars_commit_statement(void);
+/*=======================*/
+/*********************************************************************//**
+Parses a rollback statement.
+@return own: rollback node struct */
+roll_node_t*
+pars_rollback_statement(void);
+/*=========================*/
+/*********************************************************************//**
+Parses a column definition at a table creation.
+@return column sym table node */
+sym_node_t*
+pars_column_def(
+/*============*/
+	sym_node_t*		sym_node,	/*!< in: column node in the
+						symbol table */
+	pars_res_word_t*	type,		/*!< in: data type */
+	sym_node_t*		len,		/*!< in: length of column, or
+						NULL */
+	void*			is_not_null);	/*!< in: if not NULL, column
+						is of type NOT NULL. */
+/*********************************************************************//**
+Parses a table creation operation.
+@return table create subgraph */
+tab_node_t*
+pars_create_table(
+/*==============*/
+	sym_node_t*	table_sym,	/*!< in: table name node in the symbol
+					table */
+	sym_node_t*	column_defs);	/*!< in: list of column names */
+/*********************************************************************//**
+Parses an index creation operation.
+@return index create subgraph */
+ind_node_t*
+pars_create_index(
+/*==============*/
+	pars_res_word_t* unique_def,	/*!< in: not NULL if a unique index */
+	pars_res_word_t* clustered_def,	/*!< in: not NULL if a clustered index */
+	sym_node_t*	index_sym,	/*!< in: index name node in the symbol
+					table */
+	sym_node_t*	table_sym,	/*!< in: table name node in the symbol
+					table */
+	sym_node_t*	column_list);	/*!< in: list of column names */
+/*********************************************************************//**
+Parses a procedure definition.
+@return query fork node */
+que_fork_t*
+pars_procedure_definition(
+/*======================*/
+	sym_node_t*	sym_node,	/*!< in: procedure id node in the symbol
+					table */
+	que_node_t*	stat_list);	/*!< in: statement list */
+
+/** Completes a query graph by adding query thread and fork nodes
+above it and prepares the graph for running.
+@param[in]	node		root node for an incomplete query
+				graph, or NULL for dummy graph
+@param[in]	trx		transaction handle
+@param[in]	heap		memory heap from which allocated
+@param[in]	prebuilt	row prebuilt structure
+@return query thread node to run */
+que_thr_t*
+pars_complete_graph_for_exec(
+	que_node_t*	node,
+	trx_t*		trx,
+	mem_heap_t*	heap,
+	row_prebuilt_t*	prebuilt)
+	MY_ATTRIBUTE((nonnull(2,3), warn_unused_result));
+
+/****************************************************************//**
+Create parser info struct.
+@return own: info struct */
+pars_info_t*
+pars_info_create(void);
+/*==================*/
+
+/****************************************************************//**
+Add bound literal. */
+void
+pars_info_add_literal(
+/*==================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	const void*	address,	/*!< in: address */
+	ulint		length,		/*!< in: length of data */
+	ulint		type,		/*!< in: type, e.g. DATA_FIXBINARY */
+	ulint		prtype);	/*!< in: precise type, e.g.
+					DATA_UNSIGNED */
+
+/****************************************************************//**
+Equivalent to pars_info_add_literal(info, name, str, strlen(str),
+DATA_VARCHAR, DATA_ENGLISH). */
+void
+pars_info_add_str_literal(
+/*======================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	const char*	str);		/*!< in: string */
+/********************************************************************
+If the literal value already exists then it rebinds otherwise it
+creates a new entry.*/
+void
+pars_info_bind_literal(
+/*===================*/
+	pars_info_t*	info,		/* in: info struct */
+	const char*	name,		/* in: name */
+	const void*	address,	/* in: address */
+	ulint		length,		/* in: length of data */
+	ulint		type,		/* in: type, e.g. DATA_FIXBINARY */
+	ulint		prtype);	/* in: precise type, e.g. DATA_UNSIGNED */
+/********************************************************************
+If the literal value already exists then it rebinds otherwise it
+creates a new entry.*/
+void
+pars_info_bind_varchar_literal(
+/*===========================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	const byte*	str,		/*!< in: string */
+	ulint		str_len);	/*!< in: string length */
+/****************************************************************//**
+Equivalent to:
+
+char buf[4];
+mach_write_to_4(buf, val);
+pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+void
+pars_info_bind_int4_literal(
+/*=======================*/
+	pars_info_t*		info,		/*!< in: info struct */
+	const char*		name,		/*!< in: name */
+	const ib_uint32_t*	val);		/*!< in: value */
+/********************************************************************
+If the literal value already exists then it rebinds otherwise it
+creates a new entry. */
+void
+pars_info_bind_int8_literal(
+/*=======================*/
+	pars_info_t*		info,		/*!< in: info struct */
+	const char*		name,		/*!< in: name */
+	const ib_uint64_t*	val);		/*!< in: value */
+/****************************************************************//**
+Add user function. */
+void
+pars_info_bind_function(
+/*===================*/
+	pars_info_t*		info,	/*!< in: info struct */
+	const char*		name,	/*!< in: function name */
+	pars_user_func_cb_t	func,	/*!< in: function address */
+	void*			arg);	/*!< in: user-supplied argument */
+/****************************************************************//**
+Add bound id. */
+void
+pars_info_bind_id(
+/*=============*/
+	pars_info_t*		info,	/*!< in: info struct */
+	const char*		name,	/*!< in: name */
+	const char*		id);	/*!< in: id */
+/****************************************************************//**
+Equivalent to:
+
+char buf[4];
+mach_write_to_4(buf, val);
+pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+void
+pars_info_add_int4_literal(
+/*=======================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	ulint		val);		/*!< in: value */
+
+/****************************************************************//**
+Equivalent to:
+
+char buf[8];
+mach_write_to_8(buf, val);
+pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+void
+pars_info_add_ull_literal(
+/*======================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	ib_uint64_t	val);		/*!< in: value */
+
+/****************************************************************//**
+If the literal value already exists then it rebinds otherwise it
+creates a new entry. */
+void
+pars_info_bind_ull_literal(
+/*=======================*/
+	pars_info_t*		info,	/*!< in: info struct */
+	const char*		name,	/*!< in: name */
+	const ib_uint64_t*	val)	/*!< in: value */
+	MY_ATTRIBUTE((nonnull));
+
+/****************************************************************//**
+Get bound literal with the given name.
+@return bound literal, or NULL if not found */
+pars_bound_lit_t*
+pars_info_get_bound_lit(
+/*====================*/
+	pars_info_t*		info,	/*!< in: info struct */
+	const char*		name);	/*!< in: bound literal name to find */
+
+/****************************************************************//**
+Get bound id with the given name.
+@return bound id, or NULL if not found */
+pars_bound_id_t*
+pars_info_get_bound_id(
+/*===================*/
+	pars_info_t*		info,	/*!< in: info struct */
+	const char*		name);	/*!< in: bound id name to find */
+
+/******************************************************************//**
+Release any resources used by the lexer. */
+void
+pars_lexer_close(void);
+/*==================*/
+
+/** Extra information supplied for pars_sql(). */
+struct pars_info_t {
+	mem_heap_t*	heap;		/*!< own heap; see pars_info_free() */
+
+	ib_vector_t*	funcs;		/*!< user functions, or NULL
+					(pars_user_func_t*) */
+	ib_vector_t*	bound_lits;	/*!< bound literals, or NULL
+					(pars_bound_lit_t*) */
+	ib_vector_t*	bound_ids;	/*!< bound ids, or NULL
+					(pars_bound_id_t*) */
+};
+/** Free a parser info struct by passing info->heap to mem_heap_free(). */
+inline void pars_info_free(pars_info_t *info) { mem_heap_free(info->heap); }
+
+/** User-supplied function and argument; see pars_info_bind_function(). */
+struct pars_user_func_t {
+	const char*		name;	/*!< function name */
+	pars_user_func_cb_t	func;	/*!< function address */
+	void*			arg;	/*!< user-supplied argument */
+};
+
+/** Bound literal; see pars_info_add_literal(), pars_info_bind_literal(). */
+struct pars_bound_lit_t {
+	const char*	name;		/*!< name */
+	const void*	address;	/*!< address */
+	ulint		length;		/*!< length of data */
+	ulint		type;		/*!< type, e.g. DATA_FIXBINARY */
+	ulint		prtype;		/*!< precise type, e.g. DATA_UNSIGNED */
+	sym_node_t*	node;		/*!< symbol node */
+};
+
+/** Bound identifier; see pars_info_bind_id(). */
+struct pars_bound_id_t {
+	const char*	name;		/*!< name */
+	const char*	id;		/*!< identifier */
+};
+
+/** Struct used to denote a reserved word in a parsing tree */
+struct pars_res_word_t{
+	int	code;	/*!< the token code for the reserved word; see
+			enum yytokentype in pars0grm.h */
+};
+
+/** A predefined function or operator node in a parsing tree; this construct
+is also used for some non-functions like the assignment ':=' */
+struct func_node_t{
+	que_common_t	common;	/*!< type: QUE_NODE_FUNC */
+	int		func;	/*!< token code of the function name */
+	ulint		fclass;	/*!< class of the function (PARS_FUNC_*) */
+	que_node_t*	args;	/*!< argument(s) of the function */
+	UT_LIST_NODE_T(func_node_t) cond_list;
+				/*!< list of comparison conditions; defined
+				only for comparison operator nodes except,
+				presently, for OPT_SCROLL_TYPE ones */
+	UT_LIST_NODE_T(func_node_t) func_node_list;
+				/*!< list of function nodes in a parsed
+				query graph */
+};
+
+/** An order-by node in a select; returned by pars_order_by() */
+struct order_node_t{
+	que_common_t	common;	/*!< type: QUE_NODE_ORDER */
+	sym_node_t*	column;	/*!< order-by column */
+	ibool		asc;	/*!< TRUE if ascending, FALSE if descending */
+};
+
+/** Procedure definition node (query graph node of type QUE_NODE_PROC) */
+struct proc_node_t{
+	que_common_t	common;		/*!< type: QUE_NODE_PROC */
+	sym_node_t*	proc_id;	/*!< procedure name symbol in the symbol
+					table of this same procedure */
+	que_node_t*	stat_list;	/*!< statement list */
+	sym_tab_t*	sym_tab;	/*!< symbol table of this procedure */
+};
+
+/** elsif-element node; returned by pars_elsif_element() */
+struct elsif_node_t{
+	que_common_t	common;		/*!< type: QUE_NODE_ELSIF */
+	que_node_t*	cond;		/*!< if condition */
+	que_node_t*	stat_list;	/*!< statement list */
+};
+
+/** if-statement node; returned by pars_if_statement() */
+struct if_node_t{
+	que_common_t	common;		/*!< type: QUE_NODE_IF */
+	que_node_t*	cond;		/*!< if condition */
+	que_node_t*	stat_list;	/*!< statement list */
+	que_node_t*	else_part;	/*!< else-part statement list */
+	elsif_node_t*	elsif_list;	/*!< elsif element list */
+};
+
+/** while-statement node; returned by pars_while_statement() */
+struct while_node_t{
+	que_common_t	common;		/*!< type: QUE_NODE_WHILE */
+	que_node_t*	cond;		/*!< while condition */
+	que_node_t*	stat_list;	/*!< statement list */
+};
+
+/** for-loop-statement node; returned by pars_for_statement() */
+struct for_node_t{
+	que_common_t	common;		/*!< type: QUE_NODE_FOR */
+	sym_node_t*	loop_var;	/*!< loop variable: this is the
+					dereferenced symbol from the
+					variable declarations, not the
+					symbol occurrence in the for loop
+					definition */
+	que_node_t*	loop_start_limit;/*!< initial value of loop variable */
+	que_node_t*	loop_end_limit;	/*!< end value of loop variable */
+	lint		loop_end_value;	/*!< evaluated value for the end value:
+					it is calculated only when the loop
+					is entered, and will not change within
+					the loop */
+	que_node_t*	stat_list;	/*!< statement list */
+};
+
+/** exit statement node; returned by pars_exit_statement() */
+struct exit_node_t{
+	que_common_t	common;		/*!< type: QUE_NODE_EXIT */
+};
+
+/** return-statement node; returned by pars_return_statement() */
+struct return_node_t{
+	que_common_t	common;		/*!< type: QUE_NODE_RETURN */
+};
+
+/** Assignment statement node; returned by pars_assignment_statement() */
+struct assign_node_t{
+	que_common_t	common;		/*!< type: QUE_NODE_ASSIGNMENT */
+	sym_node_t*	var;		/*!< variable to set */
+	que_node_t*	val;		/*!< value to assign */
+};
+
+/** Column assignment node; returned by pars_column_assignment() */
+struct col_assign_node_t{
+	que_common_t	common;		/*!< type: QUE_NODE_COL_ASSIGN */
+	sym_node_t*	col;		/*!< column to set */
+	que_node_t*	val;		/*!< value to assign */
+};
+
+/** Classes of functions */
+/* @{ */
+#define PARS_FUNC_ARITH		1	/*!< +, -, *, / */
+#define	PARS_FUNC_LOGICAL	2	/*!< AND, OR, NOT */
+#define PARS_FUNC_CMP		3	/*!< comparison operators */
+#define	PARS_FUNC_PREDEFINED	4	/*!< TO_BINARY, SUBSTR, ... */
+#define	PARS_FUNC_AGGREGATE	5	/*!< COUNT */
+#define	PARS_FUNC_OTHER		6	/*!< these are not real functions,
+					e.g., := */
+/* @} */
+
+#endif
diff --git a/storage/innobase/include/pars0sym.h b/storage/innobase/include/pars0sym.h
new file mode 100644
index 00000000..59f6cc31
--- /dev/null
+++ b/storage/innobase/include/pars0sym.h
@@ -0,0 +1,243 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0sym.h
+SQL parser symbol table
+
+Created 12/15/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0sym_h
+#define pars0sym_h
+
+#include "que0types.h"
+#include "pars0types.h"
+#include "row0types.h"
+
+/******************************************************************//**
+Creates a symbol table for a single stored procedure or query.
+@return own: symbol table */
+sym_tab_t*
+sym_tab_create(
+/*===========*/
+	mem_heap_t*	heap);	/*!< in: memory heap where to create */
+/******************************************************************//**
+Frees the memory allocated dynamically AFTER parsing phase for variables
+etc. in the symbol table. Does not free the mem heap where the table was
+originally created. Frees also SQL explicit cursor definitions. */
+void
+sym_tab_free_private(
+/*=================*/
+	sym_tab_t*	sym_tab);	/*!< in, own: symbol table */
+/******************************************************************//**
+Adds an integer literal to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_int_lit(
+/*================*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	ulint		val);		/*!< in: integer value */
+/******************************************************************//**
+Adds a string literal to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_str_lit(
+/*================*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	const byte*	str,		/*!< in: string with no quotes around
+					it */
+	ulint		len);		/*!< in: string length */
+/******************************************************************//**
+Add a bound literal to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_bound_lit(
+/*==================*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	const char*	name,		/*!< in: name of bound literal */
+	ulint*		lit_type);	/*!< out: type of literal (PARS_*_LIT) */
+/******************************************************************//**
+Rebinds a literal to a node in the symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_rebind_lit(
+/*===============*/
+        sym_node_t*     node,           /*!< in: node bound to a literal */
+        const void*     address,        /*!< in: pointer to data */
+        ulint           length);        /*!< in: length of data */
+/******************************************************************//**
+Adds an SQL null literal to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_null_lit(
+/*=================*/
+	sym_tab_t*	sym_tab);	/*!< in: symbol table */
+/******************************************************************//**
+Adds an identifier to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_id(
+/*===========*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	byte*		name,		/*!< in: identifier name */
+	ulint		len);		/*!< in: identifier length */
+
+/******************************************************************//**
+Add a bound identifier to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_bound_id(
+/*===========*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	const char*	name);		/*!< in: name of bound id */
+
+/** Index of sym_node_t::field_nos corresponding to the clustered index */
+#define	SYM_CLUST_FIELD_NO	0
+/** Index of sym_node_t::field_nos corresponding to a secondary index */
+#define	SYM_SEC_FIELD_NO	1
+
+/** Types of a symbol table node */
+enum sym_tab_entry {
+	SYM_UNSET,		/*!< Unset entry. */
+	SYM_VAR = 91,		/*!< declared parameter or local
+				variable of a procedure */
+	SYM_IMPLICIT_VAR,	/*!< storage for an intermediate result
+				of a calculation */
+	SYM_LIT,		/*!< literal */
+	SYM_TABLE_REF_COUNTED,	/*!< database table name, ref counted. Must
+				be closed explicitly. */
+	SYM_TABLE,		/*!< database table name */
+	SYM_COLUMN,		/*!< database column name */
+	SYM_CURSOR,		/*!< named cursor */
+	SYM_PROCEDURE_NAME,	/*!< stored procedure name */
+	SYM_INDEX,		/*!< database index name */
+	SYM_FUNCTION		/*!< user function name */
+};
+
+/** Symbol table node */
+struct sym_node_t{
+	que_common_t			common;		/*!< node type:
+							QUE_NODE_SYMBOL */
+	/* NOTE: if the data field in 'common.val' is not NULL and the symbol
+	table node is not for a temporary column, the memory for the value has
+	been allocated from dynamic memory and it should be freed when the
+	symbol table is discarded */
+
+	/* 'alias' and 'indirection' are almost the same, but not quite.
+	'alias' always points to the primary instance of the variable, while
+	'indirection' does the same only if we should use the primary
+	instance's values for the node's data. This is usually the case, but
+	when initializing a cursor (e.g., "DECLARE CURSOR c IS SELECT * FROM
+	t WHERE id = x;"), we copy the values from the primary instance to
+	the cursor's instance so that they are fixed for the duration of the
+	cursor, and set 'indirection' to NULL. If we did not, the value of
+	'x' could change between fetches and things would break horribly.
+
+	TODO: It would be cleaner to make 'indirection' a boolean field and
+	always use 'alias' to refer to the primary node. */
+
+	sym_node_t*			indirection;	/*!< pointer to
+							another symbol table
+							node which contains
+							the value for this
+							node, NULL otherwise */
+	sym_node_t*			alias;		/*!< pointer to
+							another symbol table
+							node for which this
+							node is an alias,
+							NULL otherwise */
+	UT_LIST_NODE_T(sym_node_t)	col_var_list;	/*!< list of table
+							columns or a list of
+							input variables for an
+							explicit cursor */
+	ibool				copy_val;	/*!< TRUE if a column
+							and its value should
+							be copied to dynamic
+							memory when fetched */
+	ulint				field_nos[2];	/*!< if a column, in
+							the position
+							SYM_CLUST_FIELD_NO is
+							the field number in the
+							clustered index; in
+							the position
+							SYM_SEC_FIELD_NO
+							the field number in the
+							non-clustered index to
+							use first; if not found
+							from the index, then
+							ULINT_UNDEFINED */
+	ibool				resolved;	/*!< TRUE if the
+							meaning of a variable
+							or a column has been
+							resolved; for literals
+							this is always TRUE */
+	enum sym_tab_entry		token_type;	/*!< type of the
+							parsed token */
+	const char*			name;		/*!< name of an id */
+	ulint				name_len;	/*!< id name length */
+	dict_table_t*			table;		/*!< table definition
+							if a table id or a
+							column id */
+	ulint				col_no;		/*!< column number if a
+							column */
+	sel_buf_t*			prefetch_buf;	/*!< NULL, or a buffer
+							for cached column
+							values for prefetched
+							rows */
+	sel_node_t*			cursor_def;	/*!< cursor definition
+							select node if a
+							named cursor */
+	ulint				param_type;	/*!< PARS_INPUT,
+							PARS_OUTPUT, or
+							PARS_NOT_PARAM if not a
+							procedure parameter */
+	sym_tab_t*			sym_table;	/*!< back pointer to
+							the symbol table */
+	UT_LIST_NODE_T(sym_node_t)	sym_list;	/*!< list of symbol
+							nodes */
+	sym_node_t*			like_node;	/*!< LIKE operator node */
+};
+
+/** Symbol table */
+struct sym_tab_t{
+	que_t*			query_graph;
+					/*!< query graph generated by the
+					parser */
+	const char*		sql_string;
+					/*!< SQL string to parse */
+	size_t			string_len;
+					/*!< SQL string length */
+	size_t			next_char_pos;
+					/*!< position of the next character in
+					sql_string to give to the lexical
+					analyzer */
+	pars_info_t*		info;	/*!< extra information, or NULL */
+	sym_node_list_t		sym_list;
+					/*!< list of symbol nodes in the symbol
+					table */
+	UT_LIST_BASE_NODE_T(func_node_t)
+				func_node_list;
+					/*!< list of function nodes in the
+					parsed query graph */
+	mem_heap_t*		heap;	/*!< memory heap from which we can
+					allocate space */
+};
+
+#endif
diff --git a/storage/innobase/include/pars0types.h b/storage/innobase/include/pars0types.h
new file mode 100644
index 00000000..f5b69522
--- /dev/null
+++ b/storage/innobase/include/pars0types.h
@@ -0,0 +1,50 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0types.h
+SQL parser global types
+
+Created 1/11/1998 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0types_h
+#define pars0types_h
+
+struct pars_info_t;
+struct pars_user_func_t;
+struct pars_bound_lit_t;
+struct pars_bound_id_t;
+struct sym_node_t;
+struct sym_tab_t;
+struct pars_res_word_t;
+struct func_node_t;
+struct order_node_t;
+struct proc_node_t;
+struct elsif_node_t;
+struct if_node_t;
+struct while_node_t;
+struct for_node_t;
+struct exit_node_t;
+struct return_node_t;
+struct assign_node_t;
+struct col_assign_node_t;
+
+typedef UT_LIST_BASE_NODE_T(sym_node_t)	sym_node_list_t;
+
+#endif
diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h
new file mode 100644
index 00000000..c60f390a
--- /dev/null
+++ b/storage/innobase/include/que0que.h
@@ -0,0 +1,314 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0que.h
+Query graph
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef que0que_h
+#define que0que_h
+
+#include "data0data.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "srv0srv.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "pars0types.h"
+
+/***********************************************************************//**
+Creates a query graph fork node.
+@return own: fork node */
+que_fork_t *que_fork_create(mem_heap_t* heap);
+/***********************************************************************//**
+Gets the first thr in a fork. */
+UNIV_INLINE
+que_thr_t*
+que_fork_get_first_thr(
+/*===================*/
+	que_fork_t*	fork);	/*!< in: query fork */
+/***********************************************************************//**
+Gets the child node of the first thr in a fork. */
+UNIV_INLINE
+que_node_t*
+que_fork_get_child(
+/*===============*/
+	que_fork_t*	fork);	/*!< in: query fork */
+/***********************************************************************//**
+Sets the parent of a graph node. */
+UNIV_INLINE
+void
+que_node_set_parent(
+/*================*/
+	que_node_t*	node,	/*!< in: graph node */
+	que_node_t*	parent);/*!< in: parent */
+/** Creates a query graph thread node.
+@param[in]	parent		parent node, i.e., a fork node
+@param[in]	heap		memory heap where created
+@param[in]	prebuilt	row prebuilt structure
+@return own: query thread node */
+que_thr_t*
+que_thr_create(
+	que_fork_t*	parent,
+	mem_heap_t*	heap,
+	row_prebuilt_t*	prebuilt);
+/**********************************************************************//**
+Frees a query graph, but not the heap where it was created. Does not free
+explicit cursor declarations, they are freed in que_graph_free. */
+void
+que_graph_free_recursive(
+/*=====================*/
+	que_node_t*	node);	/*!< in: query graph node */
+/**********************************************************************//**
+Frees a query graph. */
+void
+que_graph_free(
+/*===========*/
+	que_t*	graph);	/*!< in: query graph; we assume that the memory
+			heap where this graph was created is private
+			to this graph: if not, then use
+			que_graph_free_recursive and free the heap
+			afterwards! */
+
+/**********************************************************************//**
+Run a query thread. Handles lock waits. */
+void
+que_run_threads(
+/*============*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+Starts execution of a command in a query fork. Picks a query thread which
+is not in the QUE_THR_RUNNING state and moves it to that state. If none
+can be chosen, a situation which may arise in parallelized fetches, NULL
+is returned.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+que_thr_t*
+que_fork_start_command(
+/*===================*/
+	que_fork_t*	fork);	/*!< in: a query fork */
+/***********************************************************************//**
+Gets the trx of a query thread. */
+UNIV_INLINE
+trx_t*
+thr_get_trx(
+/*========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/***********************************************************************//**
+Gets the type of a graph node. */
+UNIV_INLINE
+ulint
+que_node_get_type(
+/*==============*/
+	const que_node_t*	node);	/*!< in: graph node */
+/***********************************************************************//**
+Gets pointer to the value data type field of a graph node. */
+UNIV_INLINE
+dtype_t*
+que_node_get_data_type(
+/*===================*/
+	que_node_t*	node);	/*!< in: graph node */
+/***********************************************************************//**
+Gets pointer to the value dfield of a graph node. */
+UNIV_INLINE
+dfield_t*
+que_node_get_val(
+/*=============*/
+	que_node_t*	node);	/*!< in: graph node */
+/***********************************************************************//**
+Gets the value buffer size of a graph node.
+@return val buffer size, not defined if val.data == NULL in node */
+UNIV_INLINE
+ulint
+que_node_get_val_buf_size(
+/*======================*/
+	que_node_t*	node);	/*!< in: graph node */
+/***********************************************************************//**
+Sets the value buffer size of a graph node. */
+UNIV_INLINE
+void
+que_node_set_val_buf_size(
+/*======================*/
+	que_node_t*	node,	/*!< in: graph node */
+	ulint		size);	/*!< in: size */
+/*********************************************************************//**
+Gets the next list node in a list of query graph nodes. */
+UNIV_INLINE
+que_node_t*
+que_node_get_next(
+/*==============*/
+	que_node_t*	node);	/*!< in: node in a list */
+/*********************************************************************//**
+Gets the parent node of a query graph node.
+@return parent node or NULL */
+UNIV_INLINE
+que_node_t*
+que_node_get_parent(
+/*================*/
+	que_node_t*	node);	/*!< in: node */
+/****************************************************************//**
+Get the first containing loop node (e.g. while_node_t or for_node_t) for the
+given node, or NULL if the node is not within a loop.
+@return containing loop node, or NULL. */
+que_node_t*
+que_node_get_containing_loop_node(
+/*==============================*/
+	que_node_t*	node);	/*!< in: node */
+/*********************************************************************//**
+Appends a query graph node to the end of a possibly empty node list.
+@return one-way list of nodes: node_list, or node if node_list was NULL */
+UNIV_INLINE
+que_node_t*
+que_node_list_add_last(
+/*===================*/
+	que_node_t*	node_list,	/*!< in: node list, or NULL */
+	que_node_t*	node);		/*!< in: node */
+/*********************************************************************//**
+Gets the last node of a list of query graph nodes.
+@return last node in the list */
+UNIV_INLINE
+que_node_t*
+que_node_list_get_last(
+/*===================*/
+	que_node_t*	node_list);	/*!< in: node list; must not be NULL */
+/*********************************************************************//**
+Gets a query graph node list length.
+@return length, for NULL list 0 */
+UNIV_INLINE
+ulint
+que_node_list_get_len(
+/*==================*/
+	que_node_t*	node_list);	/*!< in: node list, or NULL */
+/*********************************************************************//**
+Evaluate the given SQL
+@return error code or DB_SUCCESS */
+dberr_t
+que_eval_sql(
+/*=========*/
+	pars_info_t*	info,	/*!< in: info struct, or NULL */
+	const char*	sql,	/*!< in: SQL string */
+	trx_t*		trx);	/*!< in: trx */
+
+/**********************************************************************//**
+Round robin scheduler.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+que_thr_t*
+que_fork_scheduler_round_robin(
+/*===========================*/
+	que_fork_t*	fork,		/*!< in: a query fork */
+	que_thr_t*	thr);		/*!< in: current pos */
+
+/** Query thread states */
+enum que_thr_state_t {
+	/** in selects this means that the thread is at the end of its
+	result set (or start, in case of a scroll cursor); in other
+	statements, this means the thread has done its task */
+	QUE_THR_COMPLETED,
+	QUE_THR_RUNNING
+};
+
+/** Query thread lock states */
+enum que_thr_lock_t {
+	QUE_THR_LOCK_NOLOCK,
+	QUE_THR_LOCK_ROW,
+	QUE_THR_LOCK_TABLE
+};
+
+/* Query graph query thread node: the fields are protected by the
+trx_t::mutex with the exceptions named below */
+
+struct que_thr_t{
+	que_common_t	common;		/*!< type: QUE_NODE_THR */
+	que_node_t*	child;		/*!< graph child node */
+	que_t*		graph;		/*!< graph where this node belongs */
+	que_thr_state_t	state;		/*!< state of the query thread */
+	/*------------------------------*/
+	/* The following fields are private to the OS thread executing the
+	query thread, and are not protected by any mutex: */
+
+	que_node_t*	run_node;	/*!< pointer to the node where the
+					subgraph down from this node is
+					currently executed */
+	que_node_t*	prev_node;	/*!< pointer to the node from which
+					the control came */
+	ulint		resource;	/*!< resource usage of the query thread
+					thus far */
+	ulint		lock_state;	/*!< lock state of thread (table or
+					row) */
+	/*------------------------------*/
+	/* The following fields are links for the various lists that
+	this type can be on. */
+	UT_LIST_NODE_T(que_thr_t)
+			thrs;		/*!< list of thread nodes of the fork
+					node */
+	UT_LIST_NODE_T(que_thr_t)
+			queue;		/*!< list of runnable thread nodes in
+					the server task queue */
+	ulint		fk_cascade_depth; /*!< maximum cascading call depth
+					supported for foreign key constraint
+					related delete/updates */
+	row_prebuilt_t*	prebuilt;	/*!< prebuilt structure processed by
+					the query thread */
+};
+
+/* Query graph fork node: its fields are protected by the query thread mutex */
+struct que_fork_t{
+	que_common_t	common;		/*!< type: QUE_NODE_FORK */
+	que_t*		graph;		/*!< query graph of this node */
+	trx_t*		trx;		/*!< transaction: this is set only in
+					the root node */
+	ulint		state;		/*!< state of the fork node */
+	que_thr_t*	caller;		/*!< pointer to a possible calling query
+					thread */
+	UT_LIST_BASE_NODE_T(que_thr_t)
+			thrs;		/*!< list of query threads */
+	/*------------------------------*/
+	/* The fields in this section are defined only in the root node */
+	sym_tab_t*	sym_tab;	/*!< symbol table of the query,
+					generated by the parser, or NULL
+					if the graph was created 'by hand' */
+	pars_info_t*	info;		/*!< info struct, or NULL */
+
+	sel_node_t*	last_sel_node;	/*!< last executed select node, or NULL
+					if none */
+	UT_LIST_NODE_T(que_fork_t)
+			graphs;		/*!< list of query graphs of a session
+					or a stored procedure */
+	/*------------------------------*/
+	mem_heap_t*	heap;		/*!< memory heap where the fork was
+					created */
+
+};
+
+/* Query fork (or graph) states */
+#define QUE_FORK_ACTIVE		1
+#define QUE_FORK_COMMAND_WAIT	2
+
+/* Flag which is ORed to control structure statement node types */
+#define QUE_NODE_CONTROL_STAT	1024
+
+#include "que0que.inl"
+
+#endif
diff --git a/storage/innobase/include/que0que.inl b/storage/innobase/include/que0que.inl
new file mode 100644
index 00000000..e21cbad3
--- /dev/null
+++ b/storage/innobase/include/que0que.inl
@@ -0,0 +1,245 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0que.inl
+Query graph
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+/***********************************************************************//**
+Gets the transaction of a query thread. @return thr->graph->trx */
+UNIV_INLINE
+trx_t*
+thr_get_trx(
+/*========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ut_ad(thr);
+
+	return(thr->graph->trx);
+}
+
+/***********************************************************************//**
+Gets the first thr in a fork. @return first element of fork->thrs */
+UNIV_INLINE
+que_thr_t*
+que_fork_get_first_thr(
+/*===================*/
+	que_fork_t*	fork)	/*!< in: query fork */
+{
+	return(UT_LIST_GET_FIRST(fork->thrs));
+}
+
+/***********************************************************************//**
+Gets the child node of the first thr in a fork. @return that thr's child */
+UNIV_INLINE
+que_node_t*
+que_fork_get_child(
+/*===============*/
+	que_fork_t*	fork)	/*!< in: query fork */
+{
+	que_thr_t*	thr;
+
+	thr = UT_LIST_GET_FIRST(fork->thrs);
+	/* NOTE(review): thr is dereferenced unchecked; confirm never NULL */
+	return(thr->child);
+}
+
+/***********************************************************************//**
+Gets the type of a graph node. @return que_common_t::type of the node */
+UNIV_INLINE
+ulint
+que_node_get_type(
+/*==============*/
+	const que_node_t*	node)	/*!< in: graph node */
+{
+	return(reinterpret_cast<const que_common_t*>(node)->type);
+}
+
+/***********************************************************************//**
+Gets the value dfield of a graph node. @return pointer to que_common_t::val */
+UNIV_INLINE
+dfield_t*
+que_node_get_val(
+/*=============*/
+	que_node_t*	node)	/*!< in: graph node */
+{
+	ut_ad(node);
+
+	return(&(((que_common_t*) node)->val));
+}
+
+/***********************************************************************//**
+Gets the value buffer size (que_common_t::val_buf_size) of a graph node.
+@return val buffer size, not defined if val.data == NULL in node */
+UNIV_INLINE
+ulint
+que_node_get_val_buf_size(
+/*======================*/
+	que_node_t*	node)	/*!< in: graph node */
+{
+	ut_ad(node);
+
+	return(((que_common_t*) node)->val_buf_size);
+}
+
+/***********************************************************************//**
+Sets the value buffer size of a graph node (que_common_t::val_buf_size). */
+UNIV_INLINE
+void
+que_node_set_val_buf_size(
+/*======================*/
+	que_node_t*	node,	/*!< in: graph node */
+	ulint		size)	/*!< in: size */
+{
+	ut_ad(node);
+
+	((que_common_t*) node)->val_buf_size = size;
+}
+
+/***********************************************************************//**
+Sets the parent of a graph node (que_common_t::parent). */
+UNIV_INLINE
+void
+que_node_set_parent(
+/*================*/
+	que_node_t*	node,	/*!< in: graph node */
+	que_node_t*	parent)	/*!< in: parent */
+{
+	ut_ad(node);
+
+	((que_common_t*) node)->parent = parent;
+}
+
+/***********************************************************************//**
+Gets the data type of a graph node's value. @return type of the node's val */
+UNIV_INLINE
+dtype_t*
+que_node_get_data_type(
+/*===================*/
+	que_node_t*	node)	/*!< in: graph node */
+{
+	ut_ad(node);
+
+	return(dfield_get_type(&((que_common_t*) node)->val));
+}
+
+/*********************************************************************//**
+Appends a query graph node to the end of a possibly empty node list.
+@return one-way list of nodes: node_list, or node if node_list was NULL */
+UNIV_INLINE
+que_node_t*
+que_node_list_add_last(
+/*===================*/
+	que_node_t*	node_list,	/*!< in: node list, or NULL */
+	que_node_t*	node)		/*!< in: node */
+{
+	que_common_t*	cnode;
+	que_common_t*	cnode2;
+
+	cnode = (que_common_t*) node;
+
+	cnode->brother = NULL;
+
+	if (node_list == NULL) {
+		/* empty list: the new node is the whole list */
+		return(node);
+	}
+
+	cnode2 = (que_common_t*) node_list;
+
+	while (cnode2->brother != NULL) {
+		cnode2 = (que_common_t*) cnode2->brother;
+	}
+
+	cnode2->brother = node;
+
+	return(node_list);
+}
+
+/*********************************************************************//**
+Gets the last node of a list of query graph nodes.
+@return last node in the list */
+UNIV_INLINE
+que_node_t*
+que_node_list_get_last(
+/*===================*/
+	que_node_t*	node_list)	/*!< in: node list; must not be NULL */
+{
+	que_common_t*	node;
+
+	ut_a(node_list != NULL);
+
+	node = (que_common_t*) node_list;
+
+	/* Follow the 'brother' links until the last element */
+	while (node->brother != NULL) {
+		node = (que_common_t*) node->brother;
+	}
+
+	return(node);
+}
+/*********************************************************************//**
+Gets the next list node in a list of query graph nodes.
+@return next node in the list (the node's 'brother' pointer) */
+UNIV_INLINE
+que_node_t*
+que_node_get_next(
+/*==============*/
+	que_node_t*	node)	/*!< in: node in a list */
+{
+	return(((que_common_t*) node)->brother);
+}
+
+/*********************************************************************//**
+Counts the nodes of a query graph node list by walking 'brother' links.
+@return length, for NULL list 0 */
+UNIV_INLINE
+ulint
+que_node_list_get_len(
+/*==================*/
+	que_node_t*	node_list)	/*!< in: node list, or NULL */
+{
+	const que_common_t*	cnode;
+	ulint			len;
+
+	cnode = (const que_common_t*) node_list;
+	len = 0;
+
+	while (cnode != NULL) {
+		len++;
+		cnode = (const que_common_t*) cnode->brother;
+	}
+
+	return(len);
+}
+
+/*********************************************************************//**
+Gets the parent node (que_common_t::parent) of a query graph node.
+@return parent node or NULL */
+UNIV_INLINE
+que_node_t*
+que_node_get_parent(
+/*================*/
+	que_node_t*	node)	/*!< in: node */
+{
+	return(((que_common_t*) node)->parent);
+}
diff --git a/storage/innobase/include/que0types.h b/storage/innobase/include/que0types.h
new file mode 100644
index 00000000..38f6e380
--- /dev/null
+++ b/storage/innobase/include/que0types.h
@@ -0,0 +1,97 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0types.h
+Query graph global types
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef que0types_h
+#define que0types_h
+
+#include "data0data.h"
+
+/* Pseudotype for all graph nodes */
+typedef void	que_node_t;
+
+/* Query graph root is a fork node */
+typedef	struct que_fork_t	que_t;
+
+struct row_prebuilt_t;
+struct que_thr_t;
+
+/* Query graph node types; QUE_NODE_CONTROL_STAT is defined in que0que.h */
+#define	QUE_NODE_LOCK		1
+#define	QUE_NODE_INSERT		2
+#define QUE_NODE_UPDATE		4
+#define	QUE_NODE_CURSOR		5
+#define	QUE_NODE_SELECT		6
+#define	QUE_NODE_AGGREGATE	7
+#define QUE_NODE_FORK		8
+#define QUE_NODE_THR		9
+#define QUE_NODE_UNDO		10
+#define QUE_NODE_COMMIT		11
+#define QUE_NODE_ROLLBACK	12
+#define QUE_NODE_PURGE		13
+#define QUE_NODE_CREATE_TABLE	14
+#define QUE_NODE_CREATE_INDEX	15
+#define QUE_NODE_SYMBOL		16
+#define QUE_NODE_RES_WORD	17
+#define QUE_NODE_FUNC		18
+#define QUE_NODE_ORDER		19
+#define QUE_NODE_PROC		(20 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_IF		(21 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_WHILE		(22 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_ASSIGNMENT	23
+#define QUE_NODE_FETCH		24
+#define QUE_NODE_OPEN		25
+#define QUE_NODE_COL_ASSIGNMENT	26
+#define QUE_NODE_FOR		(27 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_RETURN		28
+#define QUE_NODE_ROW_PRINTF	29
+#define QUE_NODE_ELSIF		30
+#define QUE_NODE_CALL		31
+#define QUE_NODE_EXIT		32
+
+/** Common struct at the beginning of each query graph node; the name of this
+substruct must be 'common' */
+
+struct que_common_t{
+	ulint		type;	/*!< query node type */
+	que_node_t*	parent;	/*!< back pointer to parent node, or NULL */
+	que_node_t*	brother;/*!< pointer to a possible brother node */
+	dfield_t	val;	/*!< evaluated value for an expression */
+	ulint		val_buf_size;
+				/*!< buffer size for the evaluated value data,
+				if the buffer has been allocated dynamically:
+				if this field is != 0, and the node is a
+				symbol node or a function node, then we
+				have to free the data field in val
+				explicitly */
+
+	/** Constructor: sets type and parent; brother NULL, val_buf_size 0 */
+	que_common_t(ulint type, que_node_t* parent) :
+		type(type), parent(parent), brother(NULL),
+		val(), val_buf_size(0)
+	{}
+};
+
+#endif
diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h
new file mode 100644
index 00000000..e002f1b7
--- /dev/null
+++ b/storage/innobase/include/read0types.h
@@ -0,0 +1,275 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/read0types.h
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "dict0mem.h"
+#include "trx0types.h"
+#include "srw_lock.h"
+#include <algorithm>
+
+/**
+  Read view lists the trx ids of those transactions for which a consistent read
+  should not see the modifications to the database.
+*/
+class ReadViewBase
+{
+  /**
+    The read should not see any transaction with trx id >= this value.
+    In other words, this is the "high water mark".
+  */
+  trx_id_t m_low_limit_id= 0;
+
+  /**
+    The read should see all trx ids which are strictly
+    smaller (<) than this value. In other words, this is the
+    "low water mark".
+  */
+  trx_id_t m_up_limit_id;
+
+  /** Set of RW transactions that were active when this snapshot was taken */
+  trx_ids_t m_ids;
+
+  /**
+    The view does not need to see the undo logs for transactions whose
+    transaction number is strictly smaller (<) than this value: they can be
+    removed in purge if not needed by other views.
+  */
+  trx_id_t m_low_limit_no;
+
+protected:
+  bool empty() const { return m_ids.empty(); }
+
+  /** @return the up limit id */
+  trx_id_t up_limit_id() const { return m_up_limit_id; }
+
+public:
+  /**
+    Append state from another view.
+
+    This method is used to find min(m_low_limit_no), min(m_low_limit_id) and
+    all transaction ids below min(m_low_limit_id). These values effectively
+    form the oldest view.
+
+    @param other    view to copy from
+  */
+  void append(const ReadViewBase &other)
+  {
+    ut_ad(&other != this);
+    if (m_low_limit_no > other.m_low_limit_no)
+      m_low_limit_no= other.m_low_limit_no;
+    if (m_low_limit_id > other.m_low_limit_id)
+      m_low_limit_id= other.m_low_limit_id;
+
+    trx_ids_t::iterator dst= m_ids.begin();
+    for (const trx_id_t id : other.m_ids)
+    {
+      if (id >= m_low_limit_id)
+        break;
+loop:
+      if (dst == m_ids.end())
+      {
+        m_ids.push_back(id);
+        dst= m_ids.end();
+        continue;
+      }
+      if (*dst < id)
+      {
+        dst++;
+        goto loop;
+      }
+      else if (*dst > id)
+        dst= m_ids.insert(dst, id) + 1;
+    }
+    m_ids.erase(std::lower_bound(dst, m_ids.end(), m_low_limit_id),
+                m_ids.end());
+
+    m_up_limit_id= m_ids.empty() ? m_low_limit_id : m_ids.front();
+    ut_ad(m_up_limit_id <= m_low_limit_id);
+  }
+
+
+  /**
+    Creates a snapshot where exactly the transactions serialized before this
+    point in time are seen in the view.
+
+    @param[in,out] trx transaction
+  */
+  inline void snapshot(trx_t *trx);
+
+
+  /**
+    Check whether the changes by id are visible.
+    @param[in] id transaction id to check against the view
+    @return whether the view sees the modifications of id.
+  */
+  bool changes_visible(trx_id_t id) const
+  MY_ATTRIBUTE((warn_unused_result))
+  {
+    if (id >= m_low_limit_id)
+      return false;
+    return id < m_up_limit_id ||
+           m_ids.empty() ||
+           !std::binary_search(m_ids.begin(), m_ids.end(), id);
+  }
+
+  /**
+    @param id transaction to check
+    @return true if view sees transaction id
+  */
+  bool sees(trx_id_t id) const { return id < m_up_limit_id; }
+
+  /** @return the low limit no */
+  trx_id_t low_limit_no() const { return m_low_limit_no; }
+
+  /** @return the low limit id */
+  trx_id_t low_limit_id() const { return m_low_limit_id; }
+
+  /** Clamp the low limit id for purge_sys.end_view */
+  void clamp_low_limit_id(trx_id_t limit)
+  {
+    if (m_low_limit_id > limit)
+      m_low_limit_id= limit;
+  }
+};
+
+
+/** A ReadView with extra members required for trx_t::read_view. */
+class ReadView: public ReadViewBase
+{
+  /**
+    View state.
+
+    Implemented as atomic to allow mutex-free view close and re-use.
+    Non-owner thread is allowed to call is_open() alone without mutex
+    protection as well. E.g. trx_sys.view_count() does this.
+
+    If non-owner thread intends to access other members as well, both
+    is_open() and other members accesses must be protected by m_mutex.
+    E.g. append_to().
+  */
+  std::atomic<bool> m_open;
+
+  /** For synchronisation with purge coordinator. */
+  mutable srw_mutex m_mutex;
+
+  /**
+    trx id of creating transaction.
+    Used exclusively by the read view owner thread.
+  */
+  trx_id_t m_creator_trx_id;
+
+public:
+  ReadView()
+  {
+    memset(reinterpret_cast<void*>(this), 0, sizeof *this);
+    m_mutex.init();
+  }
+  ~ReadView() { m_mutex.destroy(); }
+
+
+  /**
+    Opens a read view where exactly the transactions serialized before this
+    point in time are seen in the view.
+
+    View becomes visible to purge thread. Intended to be called by the ReadView
+    owner thread.
+
+    @param[in,out] trx transaction
+  */
+  void open(trx_t *trx);
+
+
+  /**
+    Closes the view.
+
+    View becomes not visible to purge thread. Intended to be called by the
+    ReadView owner thread.
+  */
+  void close() { m_open.store(false, std::memory_order_relaxed); }
+
+
+  /** Returns true if view is open. */
+  bool is_open() const { return m_open.load(std::memory_order_relaxed); }
+
+
+  /**
+    Sets the creator transaction id.
+
+    This should be set only for views created by RW transactions.
+    Intended to be called by the ReadView owner thread.
+  */
+  void set_creator_trx_id(trx_id_t id)
+  {
+    ut_ad(m_creator_trx_id == 0);
+    m_creator_trx_id= id;
+  }
+
+
+  /**
+    Writes the limits to the file, if the view is open.
+    @param file file to write to
+  */
+  void print_limits(FILE *file) const
+  {
+    m_mutex.wr_lock();
+    if (is_open())
+      fprintf(file, "Trx read view will not see trx with"
+                    " id >= " TRX_ID_FMT ", sees < " TRX_ID_FMT "\n",
+                    low_limit_id(), up_limit_id());
+    m_mutex.wr_unlock();
+  }
+
+
+  /**
+    A wrapper around ReadViewBase::changes_visible().
+    Intended to be called by the ReadView owner thread.
+  */
+  bool changes_visible(trx_id_t id) const
+  { return id == m_creator_trx_id || ReadViewBase::changes_visible(id); }
+
+  /**
+    A wrapper around ReadViewBase::append().
+    Intended to be called by the purge coordinator task.
+  */
+  void append_to(ReadViewBase *to) const
+  {
+    m_mutex.wr_lock();
+    if (is_open())
+      to->append(*this);
+    m_mutex.wr_unlock();
+  }
+
+  /**
+    Declare the object mostly inaccessible.
+  */
+  void mem_noaccess() const
+  {
+    MEM_NOACCESS(&m_open, sizeof m_open);
+    /* m_mutex is accessed via trx_sys.rw_trx_hash */
+    MEM_NOACCESS(&m_creator_trx_id, sizeof m_creator_trx_id);
+  }
+};
diff --git a/storage/innobase/include/rem0cmp.h b/storage/innobase/include/rem0cmp.h
new file mode 100644
index 00000000..3a30f5a9
--- /dev/null
+++ b/storage/innobase/include/rem0cmp.h
@@ -0,0 +1,286 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/rem0cmp.h
+Comparison services for records
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+#pragma once
+
+#include "data0data.h"
+#include "data0type.h"
+#include "rem0types.h"
+#include "page0types.h"
+
+/*************************************************************//**
+Returns TRUE if two columns are equal for comparison purposes.
+@return TRUE if the columns are considered equal in comparisons */
+ibool
+cmp_cols_are_equal(
+/*===============*/
+	const dict_col_t*	col1,	/*!< in: column 1 */
+	const dict_col_t*	col2,	/*!< in: column 2 */
+	ibool			check_charsets);
+					/*!< in: whether to check charsets */
+/** Compare two data fields.
+@param mtype          main type
+@param prtype         precise type
+@param descending     whether to use descending order
+@param data1          data field
+@param len1           length of data1 in bytes, or UNIV_SQL_NULL
+@param data2          data field
+@param len2           length of data2 in bytes, or UNIV_SQL_NULL
+@return the comparison result of data1 and data2
+@retval 0 if data1 is equal to data2
+@retval negative if data1 is less than data2
+@retval positive if data1 is greater than data2 */
+int cmp_data(ulint mtype, ulint prtype, bool descending,
+             const byte *data1, size_t len1, const byte *data2, size_t len2)
+  MY_ATTRIBUTE((warn_unused_result));
+
+/** Compare two data fields according to the data type of the first one.
+@param dfield1       data field; must have type field set
+@param dfield2       data field, compared using the type of dfield1
+@param descending    whether to use descending order
+@return the comparison result of dfield1 and dfield2
+@retval 0 if dfield1 is equal to dfield2
+@retval negative if dfield1 is less than dfield2
+@retval positive if dfield1 is greater than dfield2 */
+inline int cmp_dfield_dfield(const dfield_t *dfield1, const dfield_t *dfield2,
+                             bool descending= false)
+{
+  ut_ad(dfield_check_typed(dfield1));
+  const dtype_t *const dtype= dfield_get_type(dfield1);
+  const byte *const lhs= static_cast<const byte*>(dfield_get_data(dfield1));
+  const byte *const rhs= static_cast<const byte*>(dfield_get_data(dfield2));
+  return cmp_data(dtype->mtype, dtype->prtype, descending,
+                  lhs, dfield_get_len(dfield1),
+                  rhs, dfield_get_len(dfield2));
+}
+
+#ifdef UNIV_DEBUG
+/** Compare a GIS data tuple to a physical record.
+@param[in] dtuple data tuple
+@param[in] rec R-tree record
+@param[in] mode compare mode
+@retval negative if dtuple is less than rec */
+int cmp_dtuple_rec_with_gis(const dtuple_t *dtuple, const rec_t *rec,
+                            page_cur_mode_t mode)
+  MY_ATTRIBUTE((nonnull));
+#endif
+
+/** Compare two minimum bounding rectangles.
+The lower left corner (xmin, ymin) is compared first, and the upper right
+corner only if the lower left corners are equal. The first pair of
+coordinates that differs decides the result.
+@param a   first MBR, at least DATA_MBR_LEN bytes
+@param b   second MBR, at least DATA_MBR_LEN bytes
+@return the comparison result of a and b
+@retval 1 if a is greater than b
+@retval 0 if a is equal to b
+@retval -1 if a is less than b */
+inline int cmp_geometry_field(const void *a, const void *b)
+{
+  static_assert(SPDIMS == 2, "compatibility");
+  static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double), "compatibility");
+
+  const byte *const lhs= static_cast<const byte*>(a);
+  const byte *const rhs= static_cast<const byte*>(b);
+
+  /* Positions of the coordinates within an MBR, counted in doubles and
+  listed in the order in which they are compared. */
+  const size_t order[]=
+  {
+    /* lower left corner (xmin, ymin) */
+    0,
+    SPDIMS,
+    /* upper right corner */
+    1,
+    SPDIMS + 1
+  };
+
+  for (const size_t pos : order)
+  {
+    const size_t offset= pos * sizeof(double);
+    const double v1= mach_double_read(lhs + offset);
+    const double v2= mach_double_read(rhs + offset);
+
+    if (v1 > v2)
+      return 1;
+    if (v1 < v2)
+      return -1;
+    /* Neither is greater: go on to the next coordinate. */
+  }
+
+  return 0;
+}
+
+/** Compare a data tuple to a physical record.
+@param dtuple          data tuple
+@param rec             B-tree index record
+@param index           B-tree index
+@param offsets         rec_get_offsets(rec,index)
+@param n_cmp           number of fields to compare
+@param matched_fields  number of completely matched fields
+@return the comparison result of dtuple and rec
+@retval 0 if dtuple is equal to rec
+@retval negative if dtuple is less than rec
+@retval positive if dtuple is greater than rec */
+int cmp_dtuple_rec_with_match_low(const dtuple_t *dtuple, const rec_t *rec,
+                                  const dict_index_t *index,
+                                  const rec_offs *offsets,
+                                  ulint n_cmp, ulint *matched_fields)
+  MY_ATTRIBUTE((nonnull));
+#define cmp_dtuple_rec_with_match(tuple,rec,index,offsets,fields)	\
+	cmp_dtuple_rec_with_match_low(					\
+		tuple,rec,index,offsets,dtuple_get_n_fields_cmp(tuple),fields)
+/** Compare a data tuple to a physical record.
+@param[in]	dtuple		data tuple
+@param[in]	rec		B-tree or R-tree index record
+@param[in]	index		index tree
+@param[in]	offsets		rec_get_offsets(rec)
+@param[in,out]	matched_fields	number of completely matched fields
+@param[in,out]	matched_bytes	number of matched bytes in the first
+field that is not matched
+@return the comparison result of dtuple and rec
+@retval 0 if dtuple is equal to rec
+@retval negative if dtuple is less than rec
+@retval positive if dtuple is greater than rec */
+int
+cmp_dtuple_rec_with_match_bytes(
+	const dtuple_t*		dtuple,
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	const rec_offs*		offsets,
+	ulint*			matched_fields,
+	ulint*			matched_bytes)
+	MY_ATTRIBUTE((warn_unused_result));
+/** Compare a data tuple to a physical record, discarding the match count.
+@see cmp_dtuple_rec_with_match
+@param dtuple  data tuple
+@param rec     index record
+@param index   index
+@param offsets rec_get_offsets(rec, index)
+@retval 0 if dtuple is equal to rec
+@retval negative if dtuple is less than rec
+@retval positive if dtuple is greater than rec */
+inline int cmp_dtuple_rec(const dtuple_t *dtuple, const rec_t *rec,
+                          const dict_index_t *index, const rec_offs *offsets)
+{
+  ulint n_match= 0;
+  return cmp_dtuple_rec_with_match_low(
+    dtuple, rec, index, offsets, dtuple_get_n_fields_cmp(dtuple), &n_match);
+}
+
+/** Check if a dtuple is a prefix of a record.
+@param dtuple  data tuple
+@param rec     index record
+@param index   index
+@param offsets rec_get_offsets(rec)
+@return whether dtuple is a prefix of rec */
+bool cmp_dtuple_is_prefix_of_rec(const dtuple_t *dtuple, const rec_t *rec,
+                                 const dict_index_t *index,
+                                 const rec_offs *offsets)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Compare two physical records that contain the same number of columns,
+none of which are stored externally.
+@retval positive if rec1 (including non-ordering columns) is greater than rec2
+@retval negative if rec1 (including non-ordering columns) is less than rec2
+@retval 0 if rec1 is a duplicate of rec2 */
+int
+cmp_rec_rec_simple(
+/*===============*/
+	const rec_t*		rec1,	/*!< in: physical record */
+	const rec_t*		rec2,	/*!< in: physical record */
+	const rec_offs*		offsets1,/*!< in: rec_get_offsets(rec1, ...) */
+	const rec_offs*		offsets2,/*!< in: rec_get_offsets(rec2, ...) */
+	const dict_index_t*	index,	/*!< in: data dictionary index */
+	struct TABLE*		table)	/*!< in: MySQL table, for reporting
+					duplicate key value if applicable,
+					or NULL */
+	MY_ATTRIBUTE((nonnull(1,2,3,4), warn_unused_result));
+
+/** Compare two B-tree or R-tree records.
+Only the common first fields are compared, and externally stored fields
+are treated as equal.
+@param[in]	rec1		record (possibly not on an index page)
+@param[in]	rec2		B-tree or R-tree record in an index page
+@param[in]	offsets1	rec_get_offsets(rec1, index)
+@param[in]	offsets2	rec_get_offsets(rec2, index)
+@param[in]	index		data dictionary index
+@param[in]	nulls_unequal	true if this is for index cardinality statistics
+				estimation with innodb_stats_method=nulls_unequal
+				or innodb_stats_method=nulls_ignored
+@param[out]	matched_fields	number of completely matched fields;
+				optional (defaults to NULL)
+@retval 0 if rec1 is equal to rec2
+@retval negative if rec1 is less than rec2
+@retval positive if rec1 is greater than rec2 */
+int
+cmp_rec_rec(
+	const rec_t*		rec1,
+	const rec_t*		rec2,
+	const rec_offs*		offsets1,
+	const rec_offs*		offsets2,
+	const dict_index_t*	index,
+	bool			nulls_unequal = false,
+	ulint*			matched_fields = NULL)
+	MY_ATTRIBUTE((nonnull(1,2,3,4,5)));
+
+/** Check whether one data field is equal to, or starts with, another one,
+using the collation of dfield1.
+@param dfield1        data field whose type determines the collation
+@param dfield2        data field
+@retval true if dfield1 is equal to dfield2, or dfield2 is a prefix of dfield1
+@retval false otherwise */
+inline bool cmp_dfield_dfield_eq_prefix(const dfield_t *dfield1,
+                                        const dfield_t *dfield2)
+{
+  ut_ad(dfield_check_typed(dfield1));
+  ut_ad(dfield_check_typed(dfield2));
+  const dtype_t *dtype= dfield_get_type(dfield1);
+
+#ifdef UNIV_DEBUG
+  switch (dtype->prtype & DATA_MYSQL_TYPE_MASK) {
+  case MYSQL_TYPE_BIT:
+  case MYSQL_TYPE_STRING:
+  case MYSQL_TYPE_VAR_STRING:
+  case MYSQL_TYPE_TINY_BLOB:
+  case MYSQL_TYPE_MEDIUM_BLOB:
+  case MYSQL_TYPE_BLOB:
+  case MYSQL_TYPE_LONG_BLOB:
+  case MYSQL_TYPE_VARCHAR:
+    break;
+  default:
+    ut_error;
+  }
+#endif /* UNIV_DEBUG */
+
+  const uint coll_id= dtype_get_charset_coll(dtype->prtype);
+  CHARSET_INFO *charset= get_charset(coll_id, MYF(MY_WME));
+  ut_a(charset);
+  const uchar *lhs= static_cast<const uchar*>(dfield_get_data(dfield1));
+  const uchar *rhs= static_cast<const uchar*>(dfield_get_data(dfield2));
+  return charset->strnncoll(lhs, dfield_get_len(dfield1),
+                            rhs, dfield_get_len(dfield2), 1) == 0;
+}
diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h
new file mode 100644
index 00000000..2f038ab3
--- /dev/null
+++ b/storage/innobase/include/rem0rec.h
@@ -0,0 +1,1276 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0rec.h
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef rem0rec_h
+#define rem0rec_h
+
+#ifndef UNIV_INNOCHECKSUM
+#include "data0data.h"
+#include "rem0types.h"
+#include "mtr0types.h"
+#include "page0types.h"
+#include "dict0dict.h"
+#include "trx0types.h"
+#endif /*! UNIV_INNOCHECKSUM */
+#include <ostream>
+#include <sstream>
+
+/* Number of extra bytes in an old-style record,
+in addition to the data and the offsets */
+#define REC_N_OLD_EXTRA_BYTES	6
+/* Number of extra bytes in a new-style record,
+in addition to the data and the offsets */
+#define REC_N_NEW_EXTRA_BYTES	5
+
+#define REC_NEW_STATUS		3	/* This is single byte bit-field */
+#define REC_NEW_STATUS_MASK	0x7UL
+#define REC_NEW_STATUS_SHIFT	0
+
+/* The following four constants are needed in page0zip.cc in order to
+efficiently compress and decompress pages. */
+
+/* The offset of heap_no in a compact record */
+#define REC_NEW_HEAP_NO		4
+/* The shift of heap_no in a compact record.
+The status is stored in the low-order bits. */
+#define	REC_HEAP_NO_SHIFT	3
+
+/* Length of a B-tree node pointer, in bytes */
+#define REC_NODE_PTR_SIZE	4
+
+#ifndef UNIV_INNOCHECKSUM
+/** SQL null flag in a 1-byte offset of ROW_FORMAT=REDUNDANT records */
+constexpr rec_offs REC_1BYTE_SQL_NULL_MASK= 0x80;
+/** SQL null flag in a 2-byte offset of ROW_FORMAT=REDUNDANT records */
+constexpr rec_offs REC_2BYTE_SQL_NULL_MASK= 0x8000;
+
+/** In a 2-byte offset of ROW_FORMAT=REDUNDANT records, the second most
+significant bit denotes that the tail of a field is stored off-page. */
+constexpr rec_offs REC_2BYTE_EXTERN_MASK= 0x4000;
+
+constexpr size_t RECORD_OFFSET= 2;
+constexpr size_t INDEX_OFFSET=
+    RECORD_OFFSET + sizeof(rec_t *) / sizeof(rec_offs);
+#endif /* UNIV_INNOCHECKSUM */
+
+/* Length of the rec_get_offsets() header */
+constexpr size_t REC_OFFS_HEADER_SIZE=
+#ifdef UNIV_DEBUG
+#ifndef UNIV_INNOCHECKSUM
+    sizeof(rec_t *) / sizeof(rec_offs) +
+    sizeof(dict_index_t *) / sizeof(rec_offs) +
+#endif /* UNIV_INNOCHECKSUM */
+#endif /* UNIV_DEBUG */
+    2;
+
+/* Number of elements that should be initially allocated for the
+offsets[] array, first passed to rec_get_offsets() */
+constexpr size_t REC_OFFS_NORMAL_SIZE= 300;
+constexpr size_t REC_OFFS_SMALL_SIZE= 18;
+constexpr size_t REC_OFFS_SEC_INDEX_SIZE=
+    /* PK max key parts */ 16 + /* sec idx max key parts */ 16 +
+    /* child page number for non-leaf pages */ 1;
+
+/** Get the base address of offsets.  The extra_size is stored at
+this position, and following positions hold the end offsets of
+the fields. */
+#define rec_offs_base(offsets) (offsets + REC_OFFS_HEADER_SIZE)
+
+#ifndef UNIV_INNOCHECKSUM
+/* An offset consists of two parts: the 2 upper bits are the type and all
+other bits are the value */
+
+/** Type of a field, kept in the 2 upper bits of an offset (4 values only) */
+enum field_type_t
+{
+  /** normal field */
+  STORED_IN_RECORD= 0 << 14,
+  /** this field is stored off-page */
+  STORED_OFFPAGE= 1 << 14,
+  /** just an SQL NULL */
+  SQL_NULL= 2 << 14,
+  /** instantly added field */
+  DEFAULT= 3 << 14,
+};
+
+/** mask of the value bits: everything except the 2 upper bits */
+static constexpr rec_offs DATA_MASK= 0x3fff;
+/** mask of the type bits: the 2 upper bits */
+static constexpr rec_offs TYPE_MASK= ~DATA_MASK;
+inline field_type_t get_type(rec_offs n)
+{
+  return static_cast<field_type_t>(n & TYPE_MASK);
+}
+inline void set_type(rec_offs &n, field_type_t type)
+{
+  n= static_cast<rec_offs>((n & DATA_MASK) | type);
+}
+inline rec_offs get_value(rec_offs n) { return n & DATA_MASK; }
+inline rec_offs combine(rec_offs value, field_type_t type)
+{
+  return static_cast<rec_offs>(get_value(value) | type);
+}
+
+/** Compact flag ORed to the extra size returned by rec_get_offsets() */
+constexpr rec_offs REC_OFFS_COMPACT= rec_offs(~(rec_offs(~0) >> 1));
+/** External flag in offsets returned by rec_get_offsets() */
+constexpr rec_offs REC_OFFS_EXTERNAL= REC_OFFS_COMPACT >> 1;
+/** Default value flag in offsets returned by rec_get_offsets() */
+constexpr rec_offs REC_OFFS_DEFAULT= REC_OFFS_COMPACT >> 2;
+constexpr rec_offs REC_OFFS_MASK= REC_OFFS_DEFAULT - 1;
+
+/******************************************************//**
+The following function is used to get the offset of the
+next chained record on the same page.
+@return the page offset of the next chained record, or 0 if none */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+	MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to set the next record offset field
+of an old-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_old(
+/*==================*/
+	rec_t*	rec,	/*!< in: old-style physical record */
+	ulint	next)	/*!< in: offset of the next record */
+	MY_ATTRIBUTE((nonnull));
+/******************************************************//**
+The following function is used to set the next record offset field
+of a new-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_new(
+/*==================*/
+	rec_t*	rec,	/*!< in/out: new-style physical record */
+	ulint	next)	/*!< in: offset of the next record */
+	MY_ATTRIBUTE((nonnull));
+/******************************************************//**
+The following function is used to get the number of fields
+in an old-style record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields_old(
+/*=================*/
+	const rec_t*	rec)	/*!< in: physical record */
+	MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to get the number of fields
+in a record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields(
+/*=============*/
+	const rec_t*		rec,	/*!< in: physical record */
+	const dict_index_t*	index)	/*!< in: record descriptor */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Confirms the n_fields of the entry is sane with comparing the other
+record in the same page specified
+@param[in]	index	index
+@param[in]	rec	record of the same page
+@param[in]	entry	index entry
+@return	true if n_fields is sane */
+UNIV_INLINE
+bool
+rec_n_fields_is_sane(
+	dict_index_t*	index,
+	const rec_t*	rec,
+	const dtuple_t*	entry)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_old(
+/*================*/
+	const rec_t*	rec)	/*!< in: old-style physical record */
+	MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_new(
+/*================*/
+	const rec_t*	rec)	/*!< in: new-style physical record */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+The following function is used to retrieve the info bits of
+a record.
+@return info bits */
+UNIV_INLINE
+byte
+rec_get_info_bits(
+/*==============*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Read the status bits of a record that is not in ROW_FORMAT=REDUNDANT.
+@param[in]	rec	ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED record
+@return the status bits, as a rec_comp_status_t */
+inline
+rec_comp_status_t
+rec_get_status(const rec_t* rec)
+{
+	const byte status = *(rec - REC_NEW_STATUS) & REC_NEW_STATUS_MASK;
+	ut_ad(status <= REC_STATUS_INSTANT);
+	return static_cast<rec_comp_status_t>(status);
+}
+
+/** Overwrite the status bits of a record that is not in ROW_FORMAT=REDUNDANT.
+@param[in,out]	rec	ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED record
+@param[in]	bits	new status bits; the other bits of the byte are kept */
+inline void rec_set_status(rec_t *rec, byte bits)
+{
+  ut_ad(bits <= REC_STATUS_INSTANT);
+  byte &status_byte= rec[-REC_NEW_STATUS];
+  status_byte= static_cast<byte>((status_byte & ~REC_NEW_STATUS_MASK) | bits);
+}
+
+/** Get the storage size of the added field count (REC_STATUS_INSTANT).
+@param[in]	n_add_field	number of added fields, minus one
+@return	1 if n_add_field is less than 0x80, otherwise 2 (bytes) */
+inline unsigned rec_get_n_add_field_len(ulint n_add_field)
+{
+	ut_ad(n_add_field < REC_MAX_N_FIELDS);
+	return n_add_field >= 0x80 ? 2 : 1;
+}
+
+/** Decode the added field count in a REC_STATUS_INSTANT record.
+@param[in,out]	header	variable header; moved back past the bytes read
+@return	the stored count (number of added fields, minus one) */
+inline unsigned rec_get_n_add_field(const byte*& header)
+{
+	const unsigned low = *--header;
+	if (!(low & 0x80)) {
+		ut_ad(rec_get_n_add_field_len(low) == 1);
+		return low;
+	}
+
+	/* 2-byte form: the 7 low-order bits come first, then the rest */
+	const unsigned n_add = (low & 0x7f) | unsigned(*--header) << 7;
+	ut_ad(n_add < REC_MAX_N_FIELDS);
+	ut_ad(rec_get_n_add_field_len(n_add) == 2);
+	return n_add;
+}
+
+/** Store the added field count in a REC_STATUS_INSTANT record.
+@param[in,out]	header	where to write the count, growing downwards;
+			on return, points below the last byte written
+@param[in]	n_add	number of added fields, minus 1 */
+inline void rec_set_n_add_field(byte*& header, ulint n_add)
+{
+	ut_ad(n_add < REC_MAX_N_FIELDS);
+
+	if (n_add >= 0x80) {
+		*header-- = byte((n_add & 0x7f) | 0x80);
+		*header-- = byte(n_add >> 7);
+	} else {
+		*header-- = byte(n_add);
+	}
+}
+
+/******************************************************//**
+The following function is used to retrieve the info and status
+bits of a record.  (Only compact records have status bits.)
+@return info and status bits */
+UNIV_INLINE
+byte
+rec_get_info_and_status_bits(
+/*=========================*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+	MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to set the info and status
+bits of a record.  (Only compact records have status bits.) */
+UNIV_INLINE
+void
+rec_set_info_and_status_bits(
+/*=========================*/
+	rec_t*	rec,	/*!< in/out: compact physical record */
+	ulint	bits)	/*!< in: info bits */
+	MY_ATTRIBUTE((nonnull));
+
+/******************************************************//**
+The following function tells if record is delete marked.
+@return nonzero if delete marked */
+UNIV_INLINE
+ulint
+rec_get_deleted_flag(
+/*=================*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+	MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function tells if a new-style record is a node pointer.
+@return TRUE if node pointer */
+UNIV_INLINE
+bool
+rec_get_node_ptr_flag(
+/*==================*/
+	const rec_t*	rec)	/*!< in: physical record */
+	MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to get the order number
+of an old-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_old(
+/*================*/
+	const rec_t*	rec)	/*!< in: physical record */
+	MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to get the order number
+of a new-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_new(
+/*================*/
+	const rec_t*	rec)	/*!< in: physical record */
+	MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to test whether the data offsets
+in the record are stored in one-byte or two-byte format.
+@return TRUE if 1-byte form */
+UNIV_INLINE
+ibool
+rec_get_1byte_offs_flag(
+/*====================*/
+	const rec_t*	rec)	/*!< in: physical record */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+The following function is used to set the 1-byte offsets flag. */
+UNIV_INLINE
+void
+rec_set_1byte_offs_flag(
+/*====================*/
+	rec_t*	rec,	/*!< in: physical record */
+	ibool	flag)	/*!< in: TRUE if 1byte form */
+	MY_ATTRIBUTE((nonnull));
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 1-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the end of the field, SQL null flag ORed */
+UNIV_INLINE
+uint8_t
+rec_1_get_field_end_info(
+/*=====================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: field index */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 2-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the end of the field, SQL null flag and extern
+storage flag ORed */
+UNIV_INLINE
+uint16_t
+rec_2_get_field_end_info(
+/*=====================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: field index */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+Returns nonzero if the field is stored off-page.
+@retval 0 if the field is stored in-page
+@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */
+UNIV_INLINE
+ulint
+rec_2_is_field_extern(
+/*==================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: field index */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+Determine how many of the first n columns in a compact
+physical record are stored externally.
+@return number of externally stored columns */
+ulint
+rec_get_n_extern_new(
+/*=================*/
+	const rec_t*		rec,	/*!< in: compact physical record */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	ulint			n)	/*!< in: number of columns to scan */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Determine the offsets to each field in an index record.
+@param[in]	rec		physical record
+@param[in]	index		the index that the record belongs to
+@param[in,out]	offsets		array comprising offsets[0] allocated elements,
+				or an array from rec_get_offsets(), or NULL
+@param[in]	n_core		0, or index->n_core_fields for leaf page
+@param[in]	n_fields	maximum number of offsets to compute
+				(ULINT_UNDEFINED to compute all offsets)
+@param[in,out]	heap		memory heap
+@return the new offsets */
+rec_offs*
+rec_get_offsets_func(
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	rec_offs*		offsets,
+	ulint			n_core,
+	ulint			n_fields,
+#ifdef UNIV_DEBUG
+	const char*		file,	/*!< in: file name where called */
+	unsigned		line,	/*!< in: line number where called */
+#endif /* UNIV_DEBUG */
+	mem_heap_t**		heap)	/*!< in/out: memory heap */
+#ifdef UNIV_DEBUG
+	MY_ATTRIBUTE((nonnull(1,2,6,8),warn_unused_result));
+#else /* UNIV_DEBUG */
+	MY_ATTRIBUTE((nonnull(1,2,6),warn_unused_result));
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_DEBUG
+# define rec_get_offsets(rec, index, offsets, leaf, n, heap)		\
+	rec_get_offsets_func(rec,index,offsets,leaf,n,__FILE__,__LINE__,heap)
+#else /* UNIV_DEBUG */
+# define rec_get_offsets(rec, index, offsets, leaf, n, heap)		\
+	rec_get_offsets_func(rec, index, offsets, leaf, n, heap)
+#endif /* UNIV_DEBUG */
+
+/******************************************************//**
+The following function determines the offsets to each field
+in the record.  It can reuse a previously allocated array. */
+void
+rec_get_offsets_reverse(
+/*====================*/
+	const byte*		extra,	/*!< in: the extra bytes of a
+					compact record in reverse order,
+					excluding the fixed-size
+					REC_N_NEW_EXTRA_BYTES */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	ulint			node_ptr,/*!< in: nonzero=node pointer,
+					0=leaf node */
+	rec_offs*		offsets)/*!< in/out: array consisting of
+					offsets[0] allocated elements */
+	MY_ATTRIBUTE((nonnull));
+#ifdef UNIV_DEBUG
+/** Validate offsets returned by rec_get_offsets().
+@param[in]	rec	record, or NULL
+@param[in]	index	the index that the record belongs in, or NULL
+@param[in]	offsets	the offsets of the record
+@return true */
+bool
+rec_offs_validate(
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	const rec_offs*		offsets)
+	MY_ATTRIBUTE((nonnull(3), warn_unused_result));
+/** Update debug data in offsets, in order to tame rec_offs_validate().
+@param[in]	rec	record
+@param[in]	index	the index that the record belongs in
+@param[in]	leaf	whether the record resides in a leaf page
+@param[in,out]	offsets	offsets from rec_get_offsets() to adjust */
+void
+rec_offs_make_valid(
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	bool			leaf,
+	rec_offs*		offsets)
+	MY_ATTRIBUTE((nonnull));
+#else
+# define rec_offs_make_valid(rec, index, leaf, offsets)
+#endif /* UNIV_DEBUG */
+
+/************************************************************//**
+The following function is used to get the offset to the nth
+data field in an old-style record.
+@return offset to the field */
+ulint
+rec_get_nth_field_offs_old(
+/*=======================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n,	/*!< in: index of the field */
+	ulint*		len)	/*!< out: length of the field; UNIV_SQL_NULL
+				if SQL null */
+	MY_ATTRIBUTE((nonnull));
+#define rec_get_nth_field_old(rec, n, len) \
+((rec) + rec_get_nth_field_offs_old(rec, n, len))
+/************************************************************//**
+Gets the physical size of an old-style field.
+Also an SQL null may have a field of size > 0,
+if the data type is of a fixed size.
+@return field size in bytes */
+UNIV_INLINE
+ulint
+rec_get_nth_field_size(
+/*===================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: index of the field */
+	MY_ATTRIBUTE((warn_unused_result));
+/************************************************************//**
+The following function is used to get an offset to the nth
+data field in a record.
+@return offset from the origin of rec */
+UNIV_INLINE
+rec_offs
+rec_get_nth_field_offs(
+/*===================*/
+	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n,	/*!< in: index of the field */
+	ulint*		len)	/*!< out: length of the field; UNIV_SQL_NULL
+				if SQL null */
+	MY_ATTRIBUTE((nonnull));
+#define rec_get_nth_field(rec, offsets, n, len) \
+((rec) + rec_get_nth_field_offs(offsets, n, len))
+/******************************************************//**
+Determine if the offsets are for a record containing null BLOB pointers.
+@return first field containing a null BLOB pointer, or NULL if none found */
+UNIV_INLINE
+const byte*
+rec_offs_any_null_extern(
+/*=====================*/
+	const rec_t*	rec,		/*!< in: record */
+	const rec_offs*	offsets)	/*!< in: rec_get_offsets(rec) */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Mark the nth field as externally stored.
+@param[in,out]	offsets		array returned by rec_get_offsets()
+@param[in]	n		nth field */
+void
+rec_offs_make_nth_extern(
+        rec_offs*	offsets,
+        const ulint     n);
+
+MY_ATTRIBUTE((nonnull))
+/** Read the allocated element count stored at the head of an offsets array.
+@param[in]	offsets		array initialized by rec_offs_set_n_alloc()
+@return number of allocated elements, as stored in offsets[0] */
+inline ulint rec_offs_get_n_alloc(const rec_offs *offsets)
+{
+  ut_ad(offsets);
+  const ulint capacity= *offsets;
+  ut_ad(capacity > REC_OFFS_HEADER_SIZE);
+  MEM_CHECK_ADDRESSABLE(offsets, capacity * sizeof *offsets);
+  return capacity;
+}
+
+/** Read the number of fields whose offsets have been computed.
+@param[in]	offsets	array returned by rec_get_offsets()
+@return number of fields, as stored in offsets[1] */
+inline
+ulint
+rec_offs_n_fields(const rec_offs* offsets)
+{
+	ut_ad(offsets);
+	const ulint	n = offsets[1];
+	/* Sanity checks: the count must be positive, at most
+	REC_MAX_N_FIELDS, and fit in the allocated array with its header. */
+	ut_ad(n > 0);
+	ut_ad(n <= REC_MAX_N_FIELDS);
+	ut_ad(n + REC_OFFS_HEADER_SIZE <= rec_offs_get_n_alloc(offsets));
+	return n;
+}
+
+/** Get the type of a record field.
+@param[in]	offsets	rec_get_offsets()
+@param[in]	n	nth field
+@return	type of the record field */
+inline field_type_t rec_offs_nth_type(const rec_offs *offsets, ulint n)
+{
+  ut_ad(rec_offs_validate(NULL, NULL, offsets));
+  ut_ad(n < rec_offs_n_fields(offsets));
+  const rec_offs entry= rec_offs_base(offsets)[n + 1];
+  return get_type(entry);
+}
+
+/** Check whether a record field is absent from the record, so that
+dict_index_t::instant_field_value() must supply its value instead.
+@param[in]	offsets	rec_get_offsets()
+@param[in]	n	nth field
+@return	1 if the field type is DEFAULT, else 0 */
+inline ulint rec_offs_nth_default(const rec_offs *offsets, ulint n)
+{
+  return DEFAULT == rec_offs_nth_type(offsets, n);
+}
+
+/** Check whether a record field is SQL NULL, according to the
+field type recorded in the offsets array.
+@param[in]	offsets	rec_get_offsets()
+@param[in]	n	nth field
+@return	1 if the field type is SQL_NULL, else 0 */
+inline ulint rec_offs_nth_sql_null(const rec_offs *offsets, ulint n)
+{
+  return SQL_NULL == rec_offs_nth_type(offsets, n);
+}
+
+/** Check whether a record field is stored off-page (externally),
+according to the field type recorded in the offsets array.
+@param[in]	offsets	rec_get_offsets()
+@param[in]	n	nth field
+@return 1 if the field type is STORED_OFFPAGE, else 0 */
+inline ulint rec_offs_nth_extern(const rec_offs *offsets, ulint n)
+{
+  return STORED_OFFPAGE == rec_offs_nth_type(offsets, n);
+}
+
+/** Extract record-wide flag bits from the offsets array.
+@param[in]	offsets	rec_get_offsets()
+@param[in]	flag	mask of the flag bit(s) to extract
+@return	the masked bits; nonzero if any requested flag is set */
+inline ulint rec_offs_any_flag(const rec_offs *offsets, ulint flag)
+{
+  ut_ad(rec_offs_validate(NULL, NULL, offsets));
+  return rec_offs_base(offsets)[0] & flag;
+}
+
+/** Check whether the record described by offsets has off-page columns.
+@param[in]	offsets	rec_get_offsets()
+@return whether the REC_OFFS_EXTERNAL flag is set */
+inline bool rec_offs_any_extern(const rec_offs *offsets)
+{
+  return rec_offs_any_flag(offsets, REC_OFFS_EXTERNAL) != 0;
+}
+
+/** Check whether the record described by offsets is missing fields.
+@param[in]	offsets	rec_get_offsets()
+@return nonzero if dict_index_t::instant_field_value() is needed */
+inline ulint rec_offs_any_default(const rec_offs *offsets)
+{
+  const ulint missing= rec_offs_any_flag(offsets, REC_OFFS_DEFAULT);
+  return missing;
+}
+
+/** Tell whether the offsets describe a non-REDUNDANT record format.
+@param[in]	offsets	rec_get_offsets()
+@return	nonzero	if ROW_FORMAT is COMPACT, DYNAMIC or COMPRESSED
+@retval	0	if ROW_FORMAT=REDUNDANT */
+inline ulint rec_offs_comp(const rec_offs *offsets)
+{
+  ut_ad(rec_offs_validate(NULL, NULL, offsets));
+  return rec_offs_base(offsets)[0] & REC_OFFS_COMPACT;
+}
+
+/** Check for the metadata pseudo-record of instant ADD COLUMN or ALTER TABLE.
+@param[in]	rec	leaf page record (clustered index)
+@param[in]	comp	0 if ROW_FORMAT=REDUNDANT, else nonzero
+@return	whether REC_INFO_MIN_REC_FLAG is set in the info bits of rec */
+inline bool rec_is_metadata(const rec_t* rec, ulint comp)
+{
+	const ulint	info = rec_get_info_bits(rec, comp);
+	const bool	is_meta = (info & REC_INFO_MIN_REC_FLAG) != 0;
+	ut_ad(!is_meta || !comp || rec_get_status(rec) == REC_STATUS_INSTANT);
+	return is_meta;
+}
+
+/** Check for the metadata pseudo-record of instant ADD COLUMN or ALTER TABLE.
+@param[in]	rec	leaf page record (clustered index)
+@param[in]	index	index of the record
+@return	whether the record is the metadata pseudo-record */
+inline bool rec_is_metadata(const rec_t *rec, const dict_index_t &index)
+{
+  const auto comp= index.table->not_redundant();
+  return rec_is_metadata(rec, comp);
+}
+
+/** Check for the metadata pseudo-record of plain instant ADD COLUMN.
+@param[in]	rec	leaf page record (clustered index)
+@param[in]	comp	0 if ROW_FORMAT=REDUNDANT, else nonzero
+@return	whether the info bits of rec equal exactly REC_INFO_MIN_REC_FLAG */
+inline bool rec_is_add_metadata(const rec_t* rec, ulint comp)
+{
+	const bool	is_add
+		= rec_get_info_bits(rec, comp) == REC_INFO_MIN_REC_FLAG;
+	ut_ad(!is_add || !comp || rec_get_status(rec) == REC_STATUS_INSTANT);
+	return is_add;
+}
+
+/** Check for the metadata pseudo-record of plain instant ADD COLUMN.
+@param[in]	rec	leaf page record (clustered index)
+@param[in]	index	index of the record
+@return	whether the record is the ADD COLUMN metadata pseudo-record */
+inline bool rec_is_add_metadata(const rec_t* rec, const dict_index_t& index)
+{
+	const bool	is_add = rec_is_add_metadata(
+		rec, dict_table_is_comp(index.table));
+	ut_ad(!is_add || index.is_instant());
+	return is_add;
+}
+
+/** Check for the metadata pseudo-record that instant ALTER TABLE
+(as opposed to plain ADD COLUMN) stores in the clustered index.
+@param[in]	rec	leaf page record
+@param[in]	comp	0 if ROW_FORMAT=REDUNDANT, else nonzero
+@return	whether both the MIN_REC and DELETED info bits are set in rec */
+inline bool rec_is_alter_metadata(const rec_t* rec, ulint comp)
+{
+	const ulint	both = REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG;
+	const bool	is_alter = (rec_get_info_bits(rec, comp) & both) == both;
+	ut_ad(!is_alter || rec_is_metadata(rec, comp));
+	return is_alter;
+}
+
+/** Check for the metadata pseudo-record of instant ALTER TABLE.
+@param[in]	rec	leaf page record (clustered index)
+@param[in]	index	index of the record
+@return	whether the record is the ALTER TABLE metadata pseudo-record */
+inline bool rec_is_alter_metadata(const rec_t* rec, const dict_index_t& index)
+{
+	const bool	is_alter = rec_is_alter_metadata(
+		rec, dict_table_is_comp(index.table));
+	ut_ad(!is_alter || index.is_dummy || index.is_instant());
+	return is_alter;
+}
+
+/** Check for a delete-marked user record (not a metadata pseudo-record).
+@param[in]	rec	record
+@param[in]	comp	nonzero if ROW_FORMAT!=REDUNDANT
+@return	whether the record is a delete-marked user record */
+inline bool rec_is_delete_marked(const rec_t* rec, ulint comp)
+{
+	const ulint	mask = REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG;
+	const ulint	bits = rec_get_info_bits(rec, comp) & mask;
+	return bits == REC_INFO_DELETED_FLAG;
+}
+
+/** Get the nth field of an index record, or its instant default value.
+@param[in]	rec	index record
+@param[in]	index	index
+@param[in]	offsets	rec_get_offsets(rec, index)
+@param[in]	n	field number
+@param[out]	len	length of the field in bytes, or UNIV_SQL_NULL
+@return read-only pointer to the field value */
+inline
+const byte*
+rec_get_nth_cfield(
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	const rec_offs*		offsets,
+	ulint			n,
+	ulint*			len)
+{
+	/* Do not call rec_offs_validate(rec, index, offsets) here:
+	innobase_rec_to_mysql() may invoke this function to report a
+	duplicate key during ALTER TABLE or CREATE UNIQUE INDEX, and in
+	that case the rec omits the fixed-size header of 5 or 6 bytes. */
+	if (rec_offs_nth_default(offsets, n)) {
+		/* The field is missing from rec. */
+		return index->instant_field_value(n, len);
+	}
+	return rec_get_nth_field(rec, offsets, n, len);
+}
+
+/******************************************************//**
+Gets the physical size of a field.
+@return length of field */
+UNIV_INLINE
+ulint
+rec_offs_nth_size(
+/*==============*/
+	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n)	/*!< in: nth field */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+Returns the number of extern bits set in a record.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+rec_offs_n_extern(
+/*==============*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+	MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+The following function returns the data size of an old-style physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_data_size_old(
+/*==================*/
+	const rec_t*	rec)	/*!< in: physical record */
+	MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+The following function sets the number of allocated elements
+for an array of offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_alloc(
+/*=================*/
+	rec_offs*offsets,	/*!< out: array for rec_get_offsets(),
+				must be allocated */
+	ulint	n_alloc)	/*!< in: number of elements */
+	MY_ATTRIBUTE((nonnull));
+#define rec_offs_init(offsets) /* offsets must be an array, not a pointer */ \
+	rec_offs_set_n_alloc(offsets, (sizeof (offsets)) / sizeof *(offsets))
+/**********************************************************//**
+The following function returns the data size of a physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_data_size(
+/*===============*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+	MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+Returns the total size of record minus data size of record.
+The value returned by the function is the distance from record
+start to record origin in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_extra_size(
+/*================*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+	MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+Returns the total size of a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_size(
+/*==========*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+	MY_ATTRIBUTE((warn_unused_result));
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Returns a pointer to the start of the record.
+@return pointer to start */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+	MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+Returns a pointer to the end of the record.
+@return pointer to end */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+	MY_ATTRIBUTE((warn_unused_result));
+#else /* UNIV_DEBUG */
+# define rec_get_start(rec, offsets) ((rec) - rec_offs_extra_size(offsets))
+# define rec_get_end(rec, offsets) ((rec) + rec_offs_data_size(offsets))
+#endif /* UNIV_DEBUG */
+
+/** Copy a physical record to a buffer.
+@param[out]	buf	buffer
+@param[in]	rec	physical record
+@param[in]	offsets	array returned by rec_get_offsets()
+@return pointer to the origin of the copy */
+UNIV_INLINE
+rec_t*
+rec_copy(
+	void*		buf,
+	const rec_t*	rec,
+	const rec_offs*	offsets);
+
+/** Determine the size of a data tuple prefix in a temporary file.
+@tparam redundant_temp whether to use the ROW_FORMAT=REDUNDANT format
+@param[in]	index		clustered or secondary index
+@param[in]	fields		data fields
+@param[in]	n_fields	number of data fields
+@param[out]	extra		record header size
+@param[in]	status		REC_STATUS_ORDINARY or REC_STATUS_INSTANT
+@return	total size, in bytes */
+template<bool redundant_temp>
+ulint
+rec_get_converted_size_temp(
+	const dict_index_t*	index,
+	const dfield_t*		fields,
+	ulint			n_fields,
+	ulint*			extra,
+	rec_comp_status_t	status = REC_STATUS_ORDINARY)
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+/** Determine the offset to each field in temporary file.
+@param[in]	rec	temporary file record
+@param[in]	index	index that the record belongs to
+@param[in,out]	offsets	offsets to the fields; in: rec_offs_n_fields(offsets)
+@param[in]	n_core	number of core fields (index->n_core_fields)
+@param[in]	def_val	default values for non-core fields
+@param[in]	status	REC_STATUS_ORDINARY or REC_STATUS_INSTANT */
+void
+rec_init_offsets_temp(
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	rec_offs*		offsets,
+	ulint			n_core,
+	const dict_col_t::def_t*def_val,
+	rec_comp_status_t	status = REC_STATUS_ORDINARY)
+	MY_ATTRIBUTE((nonnull(1,2,3)));
+/** Determine the offset to each field in temporary file.
+@param[in]	rec	temporary file record
+@param[in]	index	index that the record belongs to
+@param[in,out]	offsets	offsets to the fields; in: rec_offs_n_fields(offsets)
+*/
+void
+rec_init_offsets_temp(
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	rec_offs*		offsets)
+	MY_ATTRIBUTE((nonnull));
+
+/** Convert a data tuple prefix to the temporary file format.
+@tparam redundant_temp whether to use the ROW_FORMAT=REDUNDANT format
+@param[out]	rec		record in temporary file format
+@param[in]	index		clustered or secondary index
+@param[in]	fields		data fields
+@param[in]	n_fields	number of data fields
+@param[in]	status		REC_STATUS_ORDINARY or REC_STATUS_INSTANT */
+template<bool redundant_temp>
+void
+rec_convert_dtuple_to_temp(
+	rec_t*			rec,
+	const dict_index_t*	index,
+	const dfield_t*		fields,
+	ulint			n_fields,
+	rec_comp_status_t	status = REC_STATUS_ORDINARY)
+	MY_ATTRIBUTE((nonnull));
+
+/**************************************************************//**
+Copies the first n fields of a physical record to a new physical record in
+a buffer.
+@return own: copied record */
+rec_t*
+rec_copy_prefix_to_buf(
+/*===================*/
+	const rec_t*		rec,		/*!< in: physical record */
+	const dict_index_t*	index,		/*!< in: record descriptor */
+	ulint			n_fields,	/*!< in: number of fields
+						to copy */
+	byte**			buf,		/*!< in/out: memory buffer
+						for the copied prefix,
+						or NULL */
+	ulint*			buf_size)	/*!< in/out: buffer size */
+	MY_ATTRIBUTE((nonnull));
+/*********************************************************//**
+Builds a physical record out of a data tuple and
+stores it into the given buffer.
+@return pointer to the origin of physical record */
+rec_t*
+rec_convert_dtuple_to_rec(
+/*======================*/
+	byte*			buf,	/*!< in: start address of the
+					physical record */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	const dtuple_t*		dtuple,	/*!< in: data tuple */
+	ulint			n_ext)	/*!< in: number of
+					externally stored columns */
+	MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+Returns the extra size of an old-style physical record if we know its
+data size and number of fields.
+@return extra size */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+	ulint	data_size,	/*!< in: data size */
+	ulint	n_fields,	/*!< in: number of fields */
+	ulint	n_ext)		/*!< in: number of externally stored columns */
+	MY_ATTRIBUTE((const));
+/**********************************************************//**
+Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT.
+@return total size */
+ulint
+rec_get_converted_size_comp_prefix(
+/*===============================*/
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	const dfield_t*		fields,	/*!< in: array of data fields */
+	ulint			n_fields,/*!< in: number of data fields */
+	ulint*			extra)	/*!< out: extra size */
+	MY_ATTRIBUTE((warn_unused_result, nonnull(1,2)));
+
+/** Determine the size of a record in ROW_FORMAT=COMPACT.
+@param[in]	index		record descriptor. dict_table_is_comp()
+				is assumed to hold, even if it doesn't
+@param[in]	tuple		logical record
+@param[out]	extra		extra size
+@return total size */
+ulint
+rec_get_converted_size_comp(
+	const dict_index_t*	index,
+	const dtuple_t*		tuple,
+	ulint*			extra)
+	MY_ATTRIBUTE((nonnull(1,2)));
+
+/**********************************************************//**
+The following function returns the size of a data tuple when converted to
+a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_converted_size(
+/*===================*/
+	dict_index_t*	index,	/*!< in: record descriptor */
+	const dtuple_t*	dtuple,	/*!< in: data tuple */
+	ulint		n_ext)	/*!< in: number of externally stored columns */
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
+/** Copy the first n fields of a (copy of a) physical record to a data tuple.
+The fields are copied into the memory heap.
+@param[out]	tuple		data tuple
+@param[in]	rec		index record, or a copy thereof
+@param[in]	index		index of rec
+@param[in]	n_core		index->n_core_fields at the time rec was
+				copied, or 0 if non-leaf page record
+@param[in]	n_fields	number of fields to copy
+@param[in,out]	heap		memory heap */
+void
+rec_copy_prefix_to_dtuple(
+	dtuple_t*		tuple,
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	ulint			n_core,
+	ulint			n_fields,
+	mem_heap_t*		heap)
+	MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Validates the consistency of a physical record.
+@return TRUE if ok */
+ibool
+rec_validate(
+/*=========*/
+	const rec_t*	rec,	/*!< in: physical record */
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+	MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Prints an old-style physical record. */
+void
+rec_print_old(
+/*==========*/
+	FILE*		file,	/*!< in: file where to print */
+	const rec_t*	rec)	/*!< in: physical record */
+	MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Prints a spatial index record. */
+void
+rec_print_mbr_rec(
+/*==========*/
+	FILE*		file,	/*!< in: file where to print */
+	const rec_t*	rec,	/*!< in: physical record */
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+	MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Prints a physical record. */
+void
+rec_print_new(
+/*==========*/
+	FILE*		file,	/*!< in: file where to print */
+	const rec_t*	rec,	/*!< in: physical record */
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+	MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Prints a physical record. */
+void
+rec_print(
+/*======*/
+	FILE*			file,	/*!< in: file where to print */
+	const rec_t*		rec,	/*!< in: physical record */
+	const dict_index_t*	index)	/*!< in: record descriptor */
+	MY_ATTRIBUTE((nonnull));
+
+/** Pretty-print a record.
+@param[in,out]	o	output stream
+@param[in]	rec	physical record
+@param[in]	info	rec_get_info_bits(rec)
+@param[in]	offsets	rec_get_offsets(rec) */
+void
+rec_print(
+	std::ostream&	o,
+	const rec_t*	rec,
+	ulint		info,
+	const rec_offs*	offsets);
+
+/** Pairs a record with its index so that it can be streamed with <<. */
+struct rec_index_print
+{
+	/** The record to print */
+	const rec_t*		m_rec;
+	/** The index passed along with the record */
+	const dict_index_t*	m_index;
+
+	/** Store the given record and index pointers. */
+	rec_index_print(const rec_t* rec, const dict_index_t* index)
+		: m_rec(rec), m_index(index)
+	{}
+};
+
+/** Display a record.
+@param[in,out]	o	output stream
+@param[in]	r	record to display
+@return	the output stream */
+std::ostream&
+operator<<(std::ostream& o, const rec_index_print& r);
+
+/** Pairs a record with its offsets so that it can be streamed with <<. */
+struct rec_offsets_print
+{
+	/** The record to print */
+	const rec_t*		m_rec;
+	/** Offsets to each field of the record */
+	const rec_offs*		m_offsets;
+
+	/** Store the given record and offsets pointers. */
+	rec_offsets_print(const rec_t* rec, const rec_offs* offsets)
+		: m_rec(rec), m_offsets(offsets)
+	{}
+};
+
+/** Display a record.
+@param[in,out]	o	output stream
+@param[in]	r	record to display
+@return	the output stream */
+ATTRIBUTE_COLD
+std::ostream&
+operator<<(std::ostream& o, const rec_offsets_print& r);
+
+/** String stream that is filled with a pretty-printed record or tuple */
+class rec_printer : public std::ostringstream {
+public:
+	/** Pretty-print a record; the info bits are read from its header.
+	@param rec	record with header
+	@param offsets	rec_get_offsets(rec, ...) */
+	ATTRIBUTE_COLD
+	rec_printer(const rec_t* rec, const rec_offs* offsets)
+		:
+		std::ostringstream ()
+	{
+		rec_print(*this, rec,
+			  rec_get_info_bits(rec, rec_offs_comp(offsets)),
+			  offsets);
+	}
+
+	/** Pretty-print a record whose info bits are passed separately.
+	@param rec record, possibly lacking header
+	@param info rec_get_info_bits(rec)
+	@param offsets rec_get_offsets(rec, ...) */
+	ATTRIBUTE_COLD
+	rec_printer(const rec_t* rec, ulint info, const rec_offs* offsets)
+		:
+		std::ostringstream ()
+	{
+		rec_print(*this, rec, info, offsets);
+	}
+
+	/** Pretty-print a data tuple using dtuple_print().
+	@param tuple	data tuple */
+	ATTRIBUTE_COLD
+	rec_printer(const dtuple_t* tuple)
+		:
+		std::ostringstream ()
+	{
+		dtuple_print(*this, tuple);
+	}
+
+	/** Pretty-print an array of fields using dfield_print().
+	@param field	array of data tuple fields
+	@param n	number of fields */
+	ATTRIBUTE_COLD
+	rec_printer(const dfield_t* field, ulint n)
+		:
+		std::ostringstream ()
+	{
+		dfield_print(*this, field, n);
+	}
+
+	/** Destructor */
+	~rec_printer() override = default;
+
+private:
+	/** Copy construction is not allowed (rejected at compile time) */
+	rec_printer(const rec_printer& other) = delete;
+	/** Copy assignment is not allowed (rejected at compile time) */
+	rec_printer& operator=(const rec_printer& other) = delete;
+};
+
+
+# ifdef UNIV_DEBUG
+/** Read the DB_TRX_ID of a clustered index record.
+@param[in]	rec	clustered index record
+@param[in]	index	clustered index
+@return the value of DB_TRX_ID */
+trx_id_t
+rec_get_trx_id(
+	const rec_t*		rec,
+	const dict_index_t*	index)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+# endif /* UNIV_DEBUG */
+
+/* Maximum lengths for the data in a physical record if the offsets
+are given in one byte (resp. two byte) format. */
+#define REC_1BYTE_OFFS_LIMIT	0x7FUL
+#define REC_2BYTE_OFFS_LIMIT	0x7FFFUL
+
+/* The data size of record must not be larger than this on
+REDUNDANT row format because we reserve two upmost bits in a
+two byte offset for special purposes */
+#define REDUNDANT_REC_MAX_DATA_SIZE    (16383)
+
+/* The data size of record must be smaller than this on
+COMPRESSED row format because we reserve two upmost bits in a
+two byte offset for special purposes */
+#define COMPRESSED_REC_MAX_DATA_SIZE   (16384)
+
+#ifdef WITH_WSREP
+int wsrep_rec_get_foreign_key(
+	byte 		*buf,     /* out: extracted key */
+	ulint 		*buf_len, /* in/out: length of buf */
+	const rec_t*	rec,	  /* in: physical record */
+	dict_index_t*	index_for,  /* in: index for foreign table */
+	dict_index_t*	index_ref,  /* in: index for referenced table */
+	ibool		new_protocol); /* in: protocol > 1 */
+#endif /* WITH_WSREP */
+
+#include "rem0rec.inl"
+
+#endif /* !UNIV_INNOCHECKSUM */
+#endif /* rem0rec_h */
diff --git a/storage/innobase/include/rem0rec.inl b/storage/innobase/include/rem0rec.inl
new file mode 100644
index 00000000..46c209cb
--- /dev/null
+++ b/storage/innobase/include/rem0rec.inl
@@ -0,0 +1,1134 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0rec.inl
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mach0data.h"
+#include "ut0byte.h"
+#include "dict0boot.h"
+#include "btr0types.h"
+
+/* Offsets of the bit-fields in an old-style record. NOTE! In the table the
+most significant bytes and bits are written below less significant.
+
+	(1) byte offset		(2) bit usage within byte
+	downward from
+	origin ->	1	8 bits pointer to next record
+			2	8 bits pointer to next record
+			3	1 bit short flag
+				7 bits number of fields
+			4	3 bits number of fields
+				5 bits heap number
+			5	8 bits heap number
+			6	4 bits n_owned
+				4 bits info bits
+*/
+
+/* Offsets of the bit-fields in a new-style record. NOTE! In the table the
+most significant bytes and bits are written below less significant.
+
+	(1) byte offset		(2) bit usage within byte
+	downward from
+	origin ->	1	8 bits relative offset of next record
+			2	8 bits relative offset of next record
+				  the relative offset is an unsigned 16-bit
+				  integer:
+				  (offset_of_next_record
+				   - offset_of_this_record) mod 64Ki,
+				  where mod is the modulo as a non-negative
+				  number;
+				  we can calculate the offset of the next
+				  record with the formula:
+				  relative_offset + offset_of_this_record
+				  mod srv_page_size
+			3	3 bits status:
+					000=REC_STATUS_ORDINARY
+					001=REC_STATUS_NODE_PTR
+					010=REC_STATUS_INFIMUM
+					011=REC_STATUS_SUPREMUM
+					100=REC_STATUS_INSTANT
+					1xx=reserved
+				5 bits heap number
+			4	8 bits heap number
+			5	4 bits n_owned
+				4 bits info bits
+*/
+
+/* For each bit-field of the record header we list the byte offset (counted
+down from the origin of the record), the mask, and the shift to extract it. */
+
+#define REC_NEXT		2
+#define REC_NEXT_MASK		0xFFFFUL
+#define REC_NEXT_SHIFT		0
+
+#define REC_OLD_SHORT		3	/* This is a single-byte bit-field */
+#define REC_OLD_SHORT_MASK	0x1UL
+#define REC_OLD_SHORT_SHIFT	0
+
+#define REC_OLD_N_FIELDS	4
+#define REC_OLD_N_FIELDS_MASK	0x7FEUL
+#define REC_OLD_N_FIELDS_SHIFT	1
+
+#define REC_OLD_HEAP_NO		5
+#define REC_HEAP_NO_MASK	0xFFF8UL
+#if 0 /* these two are defined in rem0rec.h instead, for use by page0zip.cc */
+#define REC_NEW_HEAP_NO		4
+#define	REC_HEAP_NO_SHIFT	3
+#endif
+
+#define REC_OLD_N_OWNED		6	/* This is a single-byte bit-field */
+#define REC_NEW_N_OWNED		5	/* This is a single-byte bit-field */
+#define	REC_N_OWNED_MASK	0xFUL
+#define REC_N_OWNED_SHIFT	0
+
+#define REC_OLD_INFO_BITS	6	/* This is a single-byte bit-field */
+#define REC_NEW_INFO_BITS	5	/* This is a single-byte bit-field */
+#define	REC_INFO_BITS_MASK	0xF0UL
+#define REC_INFO_BITS_SHIFT	0	/* info bits are read unshifted (high nibble) */
+
+#if REC_OLD_SHORT_MASK << (8 * (REC_OLD_SHORT - 3)) \
+		^ REC_OLD_N_FIELDS_MASK << (8 * (REC_OLD_N_FIELDS - 4)) \
+		^ REC_HEAP_NO_MASK << (8 * (REC_OLD_HEAP_NO - 4)) \
+		^ REC_N_OWNED_MASK << (8 * (REC_OLD_N_OWNED - 3)) \
+		^ REC_INFO_BITS_MASK << (8 * (REC_OLD_INFO_BITS - 3)) \
+		^ 0xFFFFFFFFUL
+# error "sum of old-style masks != 0xFFFFFFFFUL"
+#endif /* the positioned old-style masks must XOR to 0xFFFFFFFF */
+#if REC_NEW_STATUS_MASK << (8 * (REC_NEW_STATUS - 3)) \
+		^ REC_HEAP_NO_MASK << (8 * (REC_NEW_HEAP_NO - 4)) \
+		^ REC_N_OWNED_MASK << (8 * (REC_NEW_N_OWNED - 3)) \
+		^ REC_INFO_BITS_MASK << (8 * (REC_NEW_INFO_BITS - 3)) \
+		^ 0xFFFFFFUL
+# error "sum of new-style masks != 0xFFFFFFUL"
+#endif /* the positioned new-style masks must XOR to 0xFFFFFF */
+
+/******************************************************//**
+Gets a bit field from within 1 byte. @return (rec[-offs] & mask) >> shift */
+UNIV_INLINE
+byte
+rec_get_bit_field_1(
+/*================*/
+	const rec_t*	rec,	/*!< in: pointer to record origin */
+	ulint		offs,	/*!< in: offset of the byte, counted down from the origin */
+	ulint		mask,	/*!< in: mask used to filter bits */
+	ulint		shift)	/*!< in: shift right applied after masking */
+{
+  return static_cast<byte>((*(rec - offs) & mask) >> shift);
+}
+
+/******************************************************//**
+Sets a bit field within 1 byte, leaving the bits outside mask unchanged. */
+UNIV_INLINE
+void
+rec_set_bit_field_1(
+/*================*/
+	rec_t*	rec,	/*!< in/out: pointer to record origin */
+	ulint	val,	/*!< in: value to set */
+	ulint	offs,	/*!< in: offset of the byte, counted down from the origin */
+	ulint	mask,	/*!< in: mask selecting the bits to replace */
+	ulint	shift)	/*!< in: shift left applied to val before storing */
+{
+	ut_ad(rec);
+	ut_ad(offs <= REC_N_OLD_EXTRA_BYTES);
+	ut_ad(mask);
+	ut_ad(mask <= 0xFFUL);
+	ut_ad(((mask >> shift) << shift) == mask); /* no mask bits below shift */
+	ut_ad(((val << shift) & mask) == (val << shift)); /* val fits in mask */
+
+	mach_write_to_1(rec - offs,
+			(mach_read_from_1(rec - offs) & ~mask)
+			| (val << shift));
+}
+
+/******************************************************//**
+Gets a bit field from within 2 bytes. @return (2-byte value & mask) >> shift */
+UNIV_INLINE
+ulint
+rec_get_bit_field_2(
+/*================*/
+	const rec_t*	rec,	/*!< in: pointer to record origin */
+	ulint		offs,	/*!< in: offset of the 2-byte value, counted down from the origin */
+	ulint		mask,	/*!< in: mask used to filter bits */
+	ulint		shift)	/*!< in: shift right applied after masking */
+{
+	ut_ad(rec);
+
+	return((mach_read_from_2(rec - offs) & mask) >> shift);
+}
+
+/******************************************************//**
+Sets a bit field within 2 bytes, leaving the bits outside mask unchanged. */
+UNIV_INLINE
+void
+rec_set_bit_field_2(
+/*================*/
+	rec_t*	rec,	/*!< in/out: pointer to record origin */
+	ulint	val,	/*!< in: value to set */
+	ulint	offs,	/*!< in: offset of the 2-byte value, counted down from the origin */
+	ulint	mask,	/*!< in: mask selecting the bits to replace */
+	ulint	shift)	/*!< in: shift left applied to val before storing */
+{
+	ut_ad(rec);
+	ut_ad(offs <= REC_N_OLD_EXTRA_BYTES);
+	ut_ad(mask > 0xFFUL); /* mask must reach beyond the low byte */
+	ut_ad(mask <= 0xFFFFUL);
+	ut_ad((mask >> shift) & 1); /* lowest mask bit is at position shift */
+	ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1))); /* mask bits are contiguous */
+	ut_ad(((mask >> shift) << shift) == mask);
+	ut_ad(((val << shift) & mask) == (val << shift)); /* val fits in mask */
+
+	mach_write_to_2(rec - offs,
+			(mach_read_from_2(rec - offs) & ~mask)
+			| (val << shift));
+}
+
+/******************************************************//**
+Reads the 2-byte "next record" field stored REC_NEXT bytes below the record
+origin and converts it to the page offset of the next chained record.
+@return the page offset of the next chained record, or 0 if none */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+{
+	ulint	field_value;
+	compile_time_assert(REC_NEXT_MASK == 0xFFFFUL);
+	compile_time_assert(REC_NEXT_SHIFT == 0);
+
+	field_value = mach_read_from_2(rec - REC_NEXT);
+
+	if (comp) {
+#if UNIV_PAGE_SIZE_MAX <= 32768
+		/* Note that for 64 KiB pages, field_value can 'wrap around'
+		and the debug assertion is not valid */
+
+		/* In the following assertion, field_value is interpreted
+		as a signed 16-bit integer in 2's complement arithmetic.
+		If all platforms defined int16_t in the standard headers,
+		the expression could be written more simply as
+		(int16_t) field_value + ut_align_offset(...) < srv_page_size
+		*/
+		ut_ad((field_value >= 32768
+		       ? field_value - 65536
+		       : field_value)
+		      + ut_align_offset(rec, srv_page_size)
+		      < srv_page_size);
+#endif
+		if (field_value == 0) {
+
+			return(0);
+		}
+
+		/* There must be at least REC_N_NEW_EXTRA_BYTES + 1
+		bytes between any two records. */
+		ut_ad((field_value > REC_N_NEW_EXTRA_BYTES
+		       && field_value < 32768)
+		      || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES);
+
+		return(ut_align_offset(rec + field_value, srv_page_size));
+	} else {
+		ut_ad(field_value < srv_page_size);
+
+		return(field_value);
+	}
+}
+
+/******************************************************//**
+Sets the next record offset field of an old-style record: the value of next
+is written unchanged as 2 bytes at rec - REC_NEXT. */
+UNIV_INLINE
+void
+rec_set_next_offs_old(
+/*==================*/
+	rec_t*	rec,	/*!< in/out: old-style physical record */
+	ulint	next)	/*!< in: offset of the next record; must be < srv_page_size */
+{
+	ut_ad(srv_page_size > next);
+	compile_time_assert(REC_NEXT_MASK == 0xFFFFUL);
+	compile_time_assert(REC_NEXT_SHIFT == 0);
+	mach_write_to_2(rec - REC_NEXT, next);
+}
+
+/******************************************************//**
+Sets the next record offset field of a new-style record. A nonzero next is
+stored relative to this record's own page offset; 0 is stored as 0. */
+UNIV_INLINE
+void
+rec_set_next_offs_new(
+/*==================*/
+	rec_t*	rec,	/*!< in/out: new-style physical record */
+	ulint	next)	/*!< in: offset of the next record; must be < srv_page_size */
+{
+	ulint	field_value;
+
+	ut_ad(srv_page_size > next);
+
+	if (!next) {
+		field_value = 0;
+	} else {
+		/* The following two statements calculate
+		(next - offset_of_rec) mod 64Ki, where mod is the modulo
+		as a non-negative number */
+
+		field_value = (ulint)
+			((lint) next
+			 - (lint) ut_align_offset(rec, srv_page_size));
+		field_value &= REC_NEXT_MASK;
+	}
+
+	mach_write_to_2(rec - REC_NEXT, field_value);
+}
+
+/******************************************************//**
+Gets the number of fields in an old-style record, read from the
+REC_OLD_N_FIELDS bit-field of the record header.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields_old(
+/*=================*/
+	const rec_t*	rec)	/*!< in: old-style physical record */
+{
+	ulint	ret;
+
+	ut_ad(rec);
+
+	ret = rec_get_bit_field_2(rec, REC_OLD_N_FIELDS,
+				  REC_OLD_N_FIELDS_MASK,
+				  REC_OLD_N_FIELDS_SHIFT);
+	ut_ad(ret <= REC_MAX_N_FIELDS);
+	ut_ad(ret > 0);
+
+	return(ret);
+}
+
+/******************************************************//**
+Sets the number of fields in an old-style record, by writing the
+REC_OLD_N_FIELDS bit-field of the record header. */
+UNIV_INLINE
+void
+rec_set_n_fields_old(
+/*=================*/
+	rec_t*	rec,		/*!< in/out: old-style physical record */
+	ulint	n_fields)	/*!< in: the number of fields, 1..REC_MAX_N_FIELDS */
+{
+	ut_ad(rec);
+	ut_ad(n_fields <= REC_MAX_N_FIELDS);
+	ut_ad(n_fields > 0);
+
+	rec_set_bit_field_2(rec, n_fields, REC_OLD_N_FIELDS,
+			    REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT);
+}
+
+/******************************************************//**
+Gets the number of fields in a record. Old-style records store the count
+in their header; for compact records it is derived from index and status.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields(
+/*=============*/
+	const rec_t*		rec,	/*!< in: physical record */
+	const dict_index_t*	index)	/*!< in: record descriptor */
+{
+	ut_ad(rec);
+	ut_ad(index);
+
+	if (!dict_table_is_comp(index->table)) {
+		return(rec_get_n_fields_old(rec));
+	}
+
+	switch (rec_get_status(rec)) {
+	case REC_STATUS_INSTANT:
+	case REC_STATUS_ORDINARY:
+		return(dict_index_get_n_fields(index));
+	case REC_STATUS_NODE_PTR:
+		return(dict_index_get_n_unique_in_tree(index) + 1);
+	case REC_STATUS_INFIMUM:
+	case REC_STATUS_SUPREMUM:
+		return(1);
+	}
+
+	ut_error; /* reached only if the status matched none of the cases above */
+	return(ULINT_UNDEFINED);
+}
+
+/** Checks that the number of fields of an index entry is consistent with
+the number of fields of the given record on the same page.
+@param[in]	index	index
+@param[in]	rec	record of the same page
+@param[in]	entry	index entry
+@return	true if n_fields is sane */
+UNIV_INLINE
+bool
+rec_n_fields_is_sane(
+	dict_index_t*	index,
+	const rec_t*	rec,
+	const dtuple_t*	entry)
+{
+	const ulint n_fields = rec_get_n_fields(rec, index);
+
+	return(n_fields == dtuple_get_n_fields(entry)
+	       || (index->is_instant()
+		   && n_fields >= index->n_core_fields)
+	       /* a record of an older SYS_INDEXES table (one that lacks
+	       the merge_threshold column) is also acceptable. */
+	       || (index->table->id == DICT_INDEXES_ID
+		   && n_fields == dtuple_get_n_fields(entry) - 1));
+}
+
+/******************************************************//**
+Gets the number of records owned by the previous directory record, as
+stored in the REC_OLD_N_OWNED bit-field of an old-style record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_old(
+/*================*/
+	const rec_t*	rec)	/*!< in: old-style physical record */
+{
+	return(rec_get_bit_field_1(rec, REC_OLD_N_OWNED,
+				   REC_N_OWNED_MASK, REC_N_OWNED_SHIFT));
+}
+
+/******************************************************//**
+Gets the number of records owned by the previous directory record, as
+stored in the REC_NEW_N_OWNED bit-field of a new-style record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_new(
+/*================*/
+	const rec_t*	rec)	/*!< in: new-style physical record */
+{
+	return(rec_get_bit_field_1(rec, REC_NEW_N_OWNED,
+				   REC_N_OWNED_MASK, REC_N_OWNED_SHIFT));
+}
+
+/******************************************************//**
+Retrieves the info bits of a record; comp selects the new or old byte offset.
+@return info bits, masked with REC_INFO_BITS_MASK and shifted by REC_INFO_BITS_SHIFT */
+UNIV_INLINE
+byte
+rec_get_info_bits(
+/*==============*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+{
+	return rec_get_bit_field_1(
+		rec, comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS,
+		REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+}
+
+/******************************************************//**
+Retrieves the info bits of a record, ORed with its status bits when the
+record is compact.  (Only compact records have status bits.)
+@return info and status bits */
+UNIV_INLINE
+byte
+rec_get_info_and_status_bits(
+/*=========================*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+{
+  compile_time_assert(!((REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT)
+                        & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)));
+  if (comp)
+    return static_cast<byte>(rec_get_info_bits(rec, TRUE) |
+                             rec_get_status(rec));
+  else
+    return rec_get_info_bits(rec, FALSE);
+}
+/******************************************************//**
+Sets the info and status bits of a record, always using the new-style
+(compact) byte offsets.  (Only compact records have status bits.) */
+UNIV_INLINE
+void
+rec_set_info_and_status_bits(
+/*=========================*/
+	rec_t*	rec,	/*!< in/out: physical record */
+	ulint	bits)	/*!< in: info bits ORed with status bits */
+{
+	compile_time_assert(!((REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT)
+			      & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)));
+	rec_set_status(rec, bits & REC_NEW_STATUS_MASK);
+	rec_set_bit_field_1(rec, bits & ~REC_NEW_STATUS_MASK,
+			    REC_NEW_INFO_BITS,
+			    REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+}
+
+/******************************************************//**
+Tells if a record is delete marked, by testing REC_INFO_DELETED_FLAG.
+@return nonzero if delete marked */
+UNIV_INLINE
+ulint
+rec_get_deleted_flag(
+/*=================*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+{
+	if (comp) {
+		return(rec_get_bit_field_1(rec, REC_NEW_INFO_BITS,
+					   REC_INFO_DELETED_FLAG,
+					   REC_INFO_BITS_SHIFT));
+	} else {
+		return(rec_get_bit_field_1(rec, REC_OLD_INFO_BITS,
+					   REC_INFO_DELETED_FLAG,
+					   REC_INFO_BITS_SHIFT));
+	}
+}
+
+/******************************************************//**
+Tells if a new-style record is a node pointer.
+@return true if the record status is REC_STATUS_NODE_PTR */
+UNIV_INLINE
+bool
+rec_get_node_ptr_flag(
+/*==================*/
+	const rec_t*	rec)	/*!< in: new-style physical record */
+{
+	return(REC_STATUS_NODE_PTR == rec_get_status(rec));
+}
+
+/******************************************************//**
+Gets the order number of an old-style record in the heap of the index
+page, read from the 2-byte field at offset REC_OLD_HEAP_NO.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_old(
+/*================*/
+	const rec_t*	rec)	/*!< in: old-style physical record */
+{
+	return(rec_get_bit_field_2(rec, REC_OLD_HEAP_NO,
+				   REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT));
+}
+
+/******************************************************//**
+Gets the order number of a new-style record in the heap of the index
+page, read from the 2-byte field at offset REC_NEW_HEAP_NO.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_new(
+/*================*/
+	const rec_t*	rec)	/*!< in: new-style physical record */
+{
+	return(rec_get_bit_field_2(rec, REC_NEW_HEAP_NO,
+				   REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT));
+}
+
+/******************************************************//**
+Tests whether the data offsets in an old-style record are stored in the
+one-byte or the two-byte format (the REC_OLD_SHORT bit of the header).
+@return TRUE if 1-byte form */
+UNIV_INLINE
+ibool
+rec_get_1byte_offs_flag(
+/*====================*/
+	const rec_t*	rec)	/*!< in: old-style physical record */
+{
+	return(rec_get_bit_field_1(rec, REC_OLD_SHORT, REC_OLD_SHORT_MASK,
+				   REC_OLD_SHORT_SHIFT));
+}
+
+/******************************************************//**
+Sets the 1-byte offsets flag (the REC_OLD_SHORT bit of the record header). */
+UNIV_INLINE
+void
+rec_set_1byte_offs_flag(
+/*====================*/
+	rec_t*	rec,	/*!< in/out: old-style physical record */
+	ibool	flag)	/*!< in: TRUE if 1byte form; must be 0 or 1 */
+{
+	ut_ad(flag <= 1);
+
+	rec_set_bit_field_1(rec, flag, REC_OLD_SHORT, REC_OLD_SHORT_MASK,
+			    REC_OLD_SHORT_SHIFT);
+}
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 1-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the end of the field, SQL null flag ORed */
+UNIV_INLINE
+uint8_t
+rec_1_get_field_end_info(
+/*=====================*/
+	const rec_t*	rec,	/*!< in: record in the 1-byte offsets form */
+	ulint		n)	/*!< in: field index, < number of fields */
+{
+	ut_ad(rec_get_1byte_offs_flag(rec));
+	ut_ad(n < rec_get_n_fields_old(rec));
+
+	return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1)));
+}
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 2-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the end of the field, SQL null flag and extern
+storage flag ORed */
+UNIV_INLINE
+uint16_t
+rec_2_get_field_end_info(
+/*=====================*/
+	const rec_t*	rec,	/*!< in: record in the 2-byte offsets form */
+	ulint		n)	/*!< in: field index, < number of fields */
+{
+	ut_ad(!rec_get_1byte_offs_flag(rec));
+	ut_ad(n < rec_get_n_fields_old(rec));
+
+	return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2)));
+}
+
+/******************************************************//**
+Returns nonzero if the field of a 2-byte offsets record is stored off-page.
+@retval 0 if the field is stored in-page
+@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */
+UNIV_INLINE
+ulint
+rec_2_is_field_extern(
+/*==================*/
+	const rec_t*	rec,	/*!< in: record in the 2-byte offsets form */
+	ulint		n)	/*!< in: field index */
+{
+	return(rec_2_get_field_end_info(rec, n) & REC_2BYTE_EXTERN_MASK);
+}
+
+/**********************************************************//**
+Sets the number of allocated elements for an array of offsets;
+the count is stored in element 0 of the array. */
+UNIV_INLINE
+void
+rec_offs_set_n_alloc(
+/*=================*/
+	rec_offs*offsets,	/*!< out: array for rec_get_offsets(),
+				must be allocated */
+	ulint	n_alloc)	/*!< in: number of elements, > REC_OFFS_HEADER_SIZE */
+{
+	ut_ad(n_alloc > REC_OFFS_HEADER_SIZE);
+	MEM_UNDEFINED(offsets, n_alloc * sizeof *offsets);
+	offsets[0] = static_cast<rec_offs>(n_alloc);
+}
+
+/************************************************************//**
+Gets the offset of the nth data field in a record, and its length,
+from an offsets array. Field 0 always starts at offset 0.
+@return offset from the origin of rec */
+UNIV_INLINE
+rec_offs
+rec_get_nth_field_offs(
+/*===================*/
+	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n,	/*!< in: index of the field */
+	ulint*		len)	/*!< out: length of the field; UNIV_SQL_NULL
+				if SQL null; UNIV_SQL_DEFAULT if default value */
+{
+	ut_ad(n < rec_offs_n_fields(offsets));
+
+	rec_offs offs = n == 0 ? 0 : get_value(rec_offs_base(offsets)[n]);
+	rec_offs next_offs = rec_offs_base(offsets)[1 + n];
+
+	if (get_type(next_offs) == SQL_NULL) {
+		*len = UNIV_SQL_NULL;
+	} else if (get_type(next_offs) == DEFAULT) {
+		*len = UNIV_SQL_DEFAULT;
+	} else {
+		*len = get_value(next_offs) - offs;
+	}
+
+	return(offs);
+}
+
+/******************************************************//**
+Determine if the offsets are for a record containing null BLOB pointers.
+@return first field containing a null BLOB pointer, or NULL if none found */
+UNIV_INLINE
+const byte*
+rec_offs_any_null_extern(
+/*=====================*/
+	const rec_t*	rec,		/*!< in: record */
+	const rec_offs*	offsets)	/*!< in: rec_get_offsets(rec) */
+{
+	ulint	i;
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+	if (!rec_offs_any_extern(offsets)) {
+		return(NULL);
+	}
+
+	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+		if (rec_offs_nth_extern(offsets, i)) {
+			ulint		len;
+			const byte*	field
+				= rec_get_nth_field(rec, offsets, i, &len);
+
+			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+			if (!memcmp(field + len
+				    - BTR_EXTERN_FIELD_REF_SIZE,
+				    field_ref_zero,
+				    BTR_EXTERN_FIELD_REF_SIZE)) { /* the trailing bytes equal field_ref_zero */
+				return(field);
+			}
+		}
+	}
+
+	return(NULL);
+}
+
+/******************************************************//**
+Gets the physical size of a field: its end offset minus the previous end offset.
+@return length of field */
+UNIV_INLINE
+ulint
+rec_offs_nth_size(
+/*==============*/
+	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n)	/*!< in: nth field */
+{
+	ut_ad(rec_offs_validate(NULL, NULL, offsets));
+	ut_ad(n < rec_offs_n_fields(offsets));
+	if (!n) {
+		return get_value(rec_offs_base(offsets)[1 + n]);
+	}
+	return get_value((rec_offs_base(offsets)[1 + n]))
+	       - get_value(rec_offs_base(offsets)[n]);
+}
+
+/******************************************************//**
+Counts the fields whose extern bit is set in an offsets array.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+rec_offs_n_extern(
+/*==============*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	n = 0;
+
+	if (rec_offs_any_extern(offsets)) {
+		ulint	i;
+
+		for (i = rec_offs_n_fields(offsets); i--; ) {
+			if (rec_offs_nth_extern(offsets, i)) {
+				n++;
+			}
+		}
+	}
+
+	return(n);
+}
+
+/******************************************************//**
+Returns the offset of n - 1th field end if the record is stored in the 1-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value. This function and the 2-byte counterpart are defined here because the
+C-compiler was not able to sum negative and positive constant offsets, and
+warned of constant arithmetic overflow within the compiler.
+@return offset of the end of the PREVIOUS field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_1_get_prev_field_end_info(
+/*==========================*/
+	const rec_t*	rec,	/*!< in: record in the 1-byte offsets form */
+	ulint		n)	/*!< in: field index, <= number of fields */
+{
+	ut_ad(rec_get_1byte_offs_flag(rec));
+	ut_ad(n <= rec_get_n_fields_old(rec));
+
+	return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n)));
+}
+
+/******************************************************//**
+Returns the offset of n - 1th field end if the record is stored in the 2-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the end of the PREVIOUS field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_2_get_prev_field_end_info(
+/*==========================*/
+	const rec_t*	rec,	/*!< in: record in the 2-byte offsets form */
+	ulint		n)	/*!< in: field index, <= number of fields */
+{
+	ut_ad(!rec_get_1byte_offs_flag(rec));
+	ut_ad(n <= rec_get_n_fields_old(rec));
+
+	return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n)));
+}
+
+/******************************************************//**
+Sets the field end info for the nth field if the record is stored in the
+1-byte format; info is written as one byte. */
+UNIV_INLINE
+void
+rec_1_set_field_end_info(
+/*=====================*/
+	rec_t*	rec,	/*!< in/out: record in the 1-byte offsets form */
+	ulint	n,	/*!< in: field index, < number of fields */
+	ulint	info)	/*!< in: value to set */
+{
+	ut_ad(rec_get_1byte_offs_flag(rec));
+	ut_ad(n < rec_get_n_fields_old(rec));
+
+	mach_write_to_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1), info);
+}
+
+/******************************************************//**
+Sets the field end info for the nth field if the record is stored in the
+2-byte format; info is written as two bytes. */
+UNIV_INLINE
+void
+rec_2_set_field_end_info(
+/*=====================*/
+	rec_t*	rec,	/*!< in/out: record in the 2-byte offsets form */
+	ulint	n,	/*!< in: field index, < number of fields */
+	ulint	info)	/*!< in: value to set */
+{
+	ut_ad(!rec_get_1byte_offs_flag(rec));
+	ut_ad(n < rec_get_n_fields_old(rec));
+
+	mach_write_to_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2), info);
+}
+
+/******************************************************//**
+Returns the offset of nth field start if the record is stored in the 1-byte
+offsets form: 0 for n == 0, else the previous end info without the null flag.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_1_get_field_start_offs(
+/*=======================*/
+	const rec_t*	rec,	/*!< in: record in the 1-byte offsets form */
+	ulint		n)	/*!< in: field index, <= number of fields */
+{
+	ut_ad(rec_get_1byte_offs_flag(rec));
+	ut_ad(n <= rec_get_n_fields_old(rec));
+
+	if (n == 0) {
+
+		return(0);
+	}
+
+	return(rec_1_get_prev_field_end_info(rec, n)
+	       & ~REC_1BYTE_SQL_NULL_MASK);
+}
+
+/******************************************************//**
+Returns the offset of nth field start if the record is stored in the 2-byte
+offsets form: 0 for n == 0, else the previous end info without its flag bits.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_2_get_field_start_offs(
+/*=======================*/
+	const rec_t*	rec,	/*!< in: record in the 2-byte offsets form */
+	ulint		n)	/*!< in: field index, <= number of fields */
+{
+	ut_ad(!rec_get_1byte_offs_flag(rec));
+	ut_ad(n <= rec_get_n_fields_old(rec));
+
+	if (n == 0) {
+
+		return(0);
+	}
+
+	return(rec_2_get_prev_field_end_info(rec, n)
+	       & ~(REC_2BYTE_SQL_NULL_MASK | REC_2BYTE_EXTERN_MASK));
+}
+
+/******************************************************//**
+Reads the offset of the start of a data field in an old-style record.
+The start of an SQL null field is the end offset of the
+previous non-null field, or 0, if none exists. If n is the number of the last
+field + 1, then the end offset of the last field is returned.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_get_field_start_offs(
+/*=====================*/
+	const rec_t*	rec,	/*!< in: old-style record */
+	ulint		n)	/*!< in: field index, <= number of fields */
+{
+	ut_ad(rec);
+	ut_ad(n <= rec_get_n_fields_old(rec));
+
+	if (n == 0) {
+
+		return(0);
+	}
+
+	if (rec_get_1byte_offs_flag(rec)) {
+
+		return(rec_1_get_field_start_offs(rec, n));
+	}
+
+	return(rec_2_get_field_start_offs(rec, n));
+}
+
+/************************************************************//**
+Gets the physical size of an old-style field, computed as the start offset
+of field n + 1 minus the start offset of field n. Also an SQL null may
+have a field of size > 0, if the data type is of a fixed size.
+@return field size in bytes */
+UNIV_INLINE
+ulint
+rec_get_nth_field_size(
+/*===================*/
+	const rec_t*	rec,	/*!< in: old-style record */
+	ulint		n)	/*!< in: index of the field */
+{
+	ulint	os;
+	ulint	next_os;
+
+	os = rec_get_field_start_offs(rec, n);
+	next_os = rec_get_field_start_offs(rec, n + 1);
+
+	ut_ad(next_os - os < srv_page_size);
+
+	return(next_os - os);
+}
+
+/**********************************************************//**
+Returns the data size of an old-style physical record, that is
+the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size, i.e. the start offset of the field index one past the last */
+UNIV_INLINE
+ulint
+rec_get_data_size_old(
+/*==================*/
+	const rec_t*	rec)	/*!< in: old-style physical record */
+{
+	ut_ad(rec);
+
+	return(rec_get_field_start_offs(rec, rec_get_n_fields_old(rec)));
+}
+
+/**********************************************************//**
+Sets the number of fields in offsets; the count is stored in element 1. */
+UNIV_INLINE
+void
+rec_offs_set_n_fields(
+/*==================*/
+	rec_offs*	offsets,	/*!< in/out: array returned by
+				rec_get_offsets() */
+	ulint		n_fields)	/*!< in: number of fields, 1..REC_MAX_N_FIELDS */
+{
+	ut_ad(offsets);
+	ut_ad(n_fields > 0);
+	ut_ad(n_fields <= REC_MAX_N_FIELDS);
+	ut_ad(n_fields + REC_OFFS_HEADER_SIZE
+	      <= rec_offs_get_n_alloc(offsets)); /* must fit in the allocated array */
+	offsets[1] = static_cast<rec_offs>(n_fields);
+}
+
+/**********************************************************//**
+Returns the data size of a physical record, that is
+the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size, read from the end offset of the last field */
+UNIV_INLINE
+ulint
+rec_offs_data_size(
+/*===============*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	size;
+
+	ut_ad(rec_offs_validate(NULL, NULL, offsets));
+	size = get_value(rec_offs_base(offsets)[rec_offs_n_fields(offsets)]);
+	ut_ad(size < srv_page_size);
+	return(size);
+}
+
+/**********************************************************//**
+Returns the total size of record minus data size of record. The value
+returned by the function is the distance from record start to record origin
+in bytes. It is read from the first element of rec_offs_base(offsets).
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_extra_size(
+/*================*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	size;
+	ut_ad(rec_offs_validate(NULL, NULL, offsets));
+	size = *rec_offs_base(offsets) & REC_OFFS_MASK;
+	ut_ad(size < srv_page_size);
+	return(size);
+}
+
+/**********************************************************//**
+Returns the total size of a physical record: data size plus extra size.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_size(
+/*==========*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	return(rec_offs_data_size(offsets) + rec_offs_extra_size(offsets));
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Returns a pointer to the end of the record (rec plus its data size).
+@return pointer to end */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	return(const_cast<rec_t*>(rec + rec_offs_data_size(offsets)));
+}
+
+/**********************************************************//**
+Returns a pointer to the start of the record (rec minus its extra size).
+@return pointer to start */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	return(const_cast<rec_t*>(rec - rec_offs_extra_size(offsets)));
+}
+#endif /* UNIV_DEBUG */
+
+/** Copy a physical record (extra bytes followed by data bytes) to a buffer.
+@param[out]	buf	buffer; receives extra size + data size bytes
+@param[in]	rec	physical record
+@param[in]	offsets	array returned by rec_get_offsets()
+@return pointer to the origin of the copy */
+UNIV_INLINE
+rec_t*
+rec_copy(
+	void*		buf,
+	const rec_t*	rec,
+	const rec_offs*	offsets)
+{
+	ulint	extra_len;
+	ulint	data_len;
+
+	ut_ad(rec != NULL);
+	ut_ad(buf != NULL);
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	ut_ad(rec_validate(rec, offsets));
+
+	extra_len = rec_offs_extra_size(offsets);
+	data_len = rec_offs_data_size(offsets);
+
+	memcpy(buf, rec - extra_len, extra_len + data_len);
+
+	return((byte*) buf + extra_len);
+}
+
+/**********************************************************//**
+Returns the extra size of an old-style physical record if we know its
+data size and number of fields: one or two offset bytes per field.
+@return extra size */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+	ulint	data_size,	/*!< in: data size */
+	ulint	n_fields,	/*!< in: number of fields */
+	ulint	n_ext)		/*!< in: number of externally stored columns */
+{
+	if (!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT) {
+
+		return(REC_N_OLD_EXTRA_BYTES + n_fields);
+	}
+
+	return(REC_N_OLD_EXTRA_BYTES + 2 * n_fields);
+}
+
+/**********************************************************//**
+Returns the size of a data tuple when converted to a physical record.
+Compact tables are delegated to rec_get_converted_size_comp().
+@return size */
+UNIV_INLINE
+ulint
+rec_get_converted_size(
+/*===================*/
+	dict_index_t*	index,	/*!< in: record descriptor */
+	const dtuple_t*	dtuple,	/*!< in: data tuple */
+	ulint		n_ext)	/*!< in: number of externally stored columns */
+{
+	ulint	data_size;
+	ulint	extra_size;
+
+	ut_ad(dtuple_check_typed(dtuple));
+#ifdef UNIV_DEBUG
+	if (dict_index_is_ibuf(index)) {
+		ut_ad(dtuple->n_fields > 1);
+	} else if ((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK)
+		   == REC_STATUS_NODE_PTR) {
+		ut_ad(dtuple->n_fields - 1
+		      == dict_index_get_n_unique_in_tree_nonleaf(index));
+	} else if (index->table->id == DICT_INDEXES_ID) {
+		/* The column SYS_INDEXES.MERGE_THRESHOLD was
+		instantly added in MariaDB 10.2.2 (MySQL 5.7). */
+		ut_ad(!index->table->is_temporary());
+		ut_ad(index->n_fields == DICT_NUM_FIELDS__SYS_INDEXES);
+		ut_ad(dtuple->n_fields == DICT_NUM_FIELDS__SYS_INDEXES
+		      || dtuple->n_fields
+		      == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD);
+	} else {
+		ut_ad(dtuple->n_fields >= index->n_core_fields);
+		ut_ad(dtuple->n_fields <= index->n_fields
+		      || dtuple->is_alter_metadata());
+	}
+#endif
+
+	if (dict_table_is_comp(index->table)) {
+		return rec_get_converted_size_comp(index, dtuple, NULL);
+	}
+
+	data_size = dtuple_get_data_size(dtuple, 0);
+
+	/* If the primary key is being updated, then the new record inherits
+	externally stored fields from the delete-marked old record.
+	In that case, n_ext may be less than
+	dtuple_get_n_ext(dtuple). */
+	ut_ad(n_ext <= dtuple_get_n_ext(dtuple));
+	extra_size = rec_get_converted_extra_size(
+		data_size, dtuple_get_n_fields(dtuple), n_ext);
+
+	return(data_size + extra_size);
+}
diff --git a/storage/innobase/include/rem0types.h b/storage/innobase/include/rem0types.h
new file mode 100644
index 00000000..0e4075a9
--- /dev/null
+++ b/storage/innobase/include/rem0types.h
@@ -0,0 +1,78 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0types.h
+Record manager global types
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef rem0types_h
+#define rem0types_h
+
+/* The physical record is represented simply as an array of bytes */
+typedef byte	rec_t;
+
+/** Type of a field offset within a rec_t* */
+typedef unsigned short int rec_offs;
+
+/* Maximum values for various fields (for non-blob tuples) */
+#define REC_MAX_N_FIELDS	(1024 - 1)
+#define REC_MAX_HEAP_NO		(2 * 8192 - 1)
+#define REC_MAX_N_OWNED		(16 - 1)
+
+/* Maximum number of user-defined fields/columns. The reserved columns
+are the ones InnoDB adds internally: DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR.
+Before MariaDB Server 10.5, we needed "* 2" because mlog_parse_index()
+could create a dummy table object with some of the system columns
+in it, and then add the 3 system columns (again) using
+dict_table_add_system_columns().
+For now, we will keep this limitation to maintain file format compatibility
+with older versions. */
+#define REC_MAX_N_USER_FIELDS	(REC_MAX_N_FIELDS - DATA_N_SYS_COLS * 2)
+
+/* REC_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and is the maximum
+indexed field length (or indexed prefix length) for indexes on tables of
+ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT format.
+Before UTF-8 encodings with mbmaxlen = 4 were supported, a UTF-8 character
+could take at most 3 bytes.  So the limit was set to 3*256, so that one
+can create a column prefix index on 256 characters of a TEXT or VARCHAR
+column also in the UTF-8 charset.
+This constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
+files would be at risk! */
+#define REC_ANTELOPE_MAX_INDEX_COL_LEN		768
+
+/** Maximum indexed field length for tables that have atomic BLOBs.
+This (3072) is the maximum index row length allowed, so an index prefix
+column longer than that cannot be created. */
+#define REC_VERSION_56_MAX_INDEX_COL_LEN	3072
+
+/** InnoDB row types are a subset of the MySQL global enum row_type.
+They are made into their own enum so that switch statements can account
+for each of them. */
+enum rec_format_enum {
+	REC_FORMAT_REDUNDANT	= 0,	/*!< REDUNDANT row format */
+	REC_FORMAT_COMPACT	= 1,	/*!< COMPACT row format */
+	REC_FORMAT_COMPRESSED	= 2,	/*!< COMPRESSED row format */
+	REC_FORMAT_DYNAMIC	= 3	/*!< DYNAMIC row format */
+};
+typedef enum rec_format_enum rec_format_t;
+
+#endif
diff --git a/storage/innobase/include/row0ext.h b/storage/innobase/include/row0ext.h
new file mode 100644
index 00000000..78886332
--- /dev/null
+++ b/storage/innobase/include/row0ext.h
@@ -0,0 +1,101 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ext.h
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#ifndef row0ext_h
+#define row0ext_h
+
+#include "data0types.h"
+#include "mem0mem.h"
+#include "dict0types.h"
+#include "fsp0types.h"
+#include "row0types.h"
+
+/********************************************************************//**
+Creates a cache of column prefixes of externally stored columns.
+@return own: column prefix cache */
+row_ext_t*
+row_ext_create(
+/*===========*/
+	ulint		n_ext,	/*!< in: number of entries in ext[] */
+	const ulint*	ext,	/*!< in: col_no's of externally stored columns
+				in the InnoDB table object, as reported by
+				dict_col_get_no(); NOT relative to the records
+				in the clustered index */
+	const dict_table_t& table, /*!< in: table */
+	const dtuple_t*	tuple,	/*!< in: data tuple containing the field
+				references of the externally stored
+				columns; must be indexed by col_no;
+				the clustered index record must be
+				covered by a lock or a page latch
+				to prevent deletion (rollback or purge). */
+	mem_heap_t*	heap);	/*!< in: heap where the cache is created */
+
+/********************************************************************//**
+Looks up the cached prefix of the i'th externally stored column.
+@return column prefix, or pointer to field_ref_zero if no prefix is
+cached for the column (ext->len[i] == 0); never returns NULL */
+UNIV_INLINE
+const byte*
+row_ext_lookup_ith(
+/*===============*/
+	const row_ext_t*	ext,	/*!< in: column prefix cache */
+	ulint			i,	/*!< in: index of ext->ext[] */
+	ulint*			len);	/*!< out: length of prefix, in bytes,
+					at most the length determined by
+					DICT_MAX_FIELD_LEN_BY_FORMAT() */
+/********************************************************************//**
+Looks up a column prefix of an externally stored column by column number.
+@return column prefix, or NULL if col is not listed in ext->ext[],
+or pointer to field_ref_zero if no prefix is cached for the column */
+UNIV_INLINE
+const byte*
+row_ext_lookup(
+/*===========*/
+	const row_ext_t*	ext,	/*!< in: column prefix cache */
+	ulint			col,	/*!< in: column number in the InnoDB
+					table object, as reported by
+					dict_col_get_no(); NOT relative to the
+					records in the clustered index */
+	ulint*			len);	/*!< out: length of prefix, in bytes,
+					at most the length determined by
+					DICT_MAX_FIELD_LEN_BY_FORMAT() */
+
+/** Prefixes of externally stored columns */
+struct row_ext_t{
+	ulint		n_ext;	/*!< number of externally stored columns */
+	const ulint*	ext;	/*!< col_no's of externally stored columns */
+	byte*		buf;	/*!< backing store; prefix i is at buf + i * max_len */
+	ulint		max_len;/*!< maximum prefix length, it could be
+				REC_ANTELOPE_MAX_INDEX_COL_LEN or
+				REC_VERSION_56_MAX_INDEX_COL_LEN depending
+				on row format */
+	ulint		zip_size;/*!< ROW_FORMAT=COMPRESSED page size, or 0 */
+	ulint		len[1];	/*!< prefix lengths, indexed [0, n_ext); 0 if not cached */
+};
+
+#include "row0ext.inl"
+
+#endif
diff --git a/storage/innobase/include/row0ext.inl b/storage/innobase/include/row0ext.inl
new file mode 100644
index 00000000..913b51b3
--- /dev/null
+++ b/storage/innobase/include/row0ext.inl
@@ -0,0 +1,87 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ext.inl
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#include "rem0types.h"
+#include "btr0types.h"
+
+/********************************************************************//**
+Looks up the cached prefix of the i'th externally stored column.
+@return column prefix, or pointer to field_ref_zero if no prefix is
+cached for the column (ext->len[i] == 0); never returns NULL */
+UNIV_INLINE
+const byte*
+row_ext_lookup_ith(
+/*===============*/
+	const row_ext_t*	ext,	/*!< in: column prefix cache */
+	ulint			i,	/*!< in: index of ext->ext[] */
+	ulint*			len)	/*!< out: length of prefix, in bytes,
+					at most ext->max_len */
+{
+	ut_ad(ext);
+	ut_ad(len);
+	ut_ad(i < ext->n_ext);
+
+	const ulint	prefix_len = ext->len[i];
+	*len = prefix_len;
+
+	ut_ad(prefix_len <= ext->max_len);
+	ut_ad(ext->max_len > 0);
+
+	/* A zero length means that the BLOB could not be fetched
+	to the cache. */
+	return(prefix_len != 0
+	       ? ext->buf + i * ext->max_len
+	       : field_ref_zero);
+}
+
+/********************************************************************//**
+Looks up a column prefix of an externally stored column by column number.
+@return column prefix, or NULL if col is not listed in ext->ext[],
+or pointer to field_ref_zero if no prefix is cached for the column */
+UNIV_INLINE
+const byte*
+row_ext_lookup(
+/*===========*/
+	const row_ext_t*	ext,	/*!< in: column prefix cache */
+	ulint			col,	/*!< in: column number in the InnoDB
+					table object, as reported by
+					dict_col_get_no(); NOT relative to the
+					records in the clustered index */
+	ulint*			len)	/*!< out: length of prefix, in bytes,
+					at most ext->max_len */
+{
+	ut_ad(ext);
+	ut_ad(len);
+
+	/* Linear search of the col_no array for the requested column. */
+	for (ulint pos = 0; pos < ext->n_ext; pos++) {
+		if (ext->ext[pos] != col) {
+			continue;
+		}
+		return(row_ext_lookup_ith(ext, pos, len));
+	}
+
+	return(NULL);
+}
diff --git a/storage/innobase/include/row0ftsort.h b/storage/innobase/include/row0ftsort.h
new file mode 100644
index 00000000..3ffa8243
--- /dev/null
+++ b/storage/innobase/include/row0ftsort.h
@@ -0,0 +1,268 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ftsort.h
+Create Full Text Index with (parallel) merge sort
+
+Created 10/13/2010 Jimmy Yang
+*******************************************************/
+
+#ifndef row0ftsort_h
+#define row0ftsort_h
+
+#include "data0data.h"
+#include "fts0fts.h"
+#include "fts0priv.h"
+#include "rem0types.h"
+#include "row0merge.h"
+#include "btr0bulk.h"
+#include "srv0srv.h"
+
+/** This structure defines the information that the scan thread will fetch
+and put to the linked list for parallel tokenization/sort threads
+to process */
+typedef struct fts_doc_item     fts_doc_item_t;
+
+/** One document to process; an element of fts_doc_list_t */
+struct fts_doc_item {
+	dfield_t*	field;		/*!< field contains document string */
+	doc_id_t	doc_id;		/*!< document ID */
+	UT_LIST_NODE_T(fts_doc_item_t)	doc_list;
+					/*!< list of doc items */
+};
+
+/** The list type through which the scan thread feeds the parallel
+tokenization threads and sort threads. */
+typedef UT_LIST_BASE_NODE_T(fts_doc_item_t)     fts_doc_list_t;
+
+#define FTS_PLL_MERGE		1
+
+/** Sort information passed to each individual parallel sort thread */
+struct fts_psort_t;
+
+/** Common info passed to each parallel sort thread */
+struct fts_psort_common_t {
+	row_merge_dup_t*	dup;		/*!< descriptor of FTS index */
+	dict_table_t*		new_table;	/*!< source table */
+	/** Page size of the old table */
+	ulint			old_zip_size;
+	trx_t*			trx;		/*!< transaction */
+	fts_psort_t*		all_info;	/*!< all parallel sort info */
+	pthread_cond_t		sort_cond;	/*!< condition for sort completion */
+	ibool			opt_doc_id_size;/*!< whether to use a 4-byte
+						instead of an 8-byte integer to
+						store Doc ID during sort, if
+						the Doc IDs are small enough
+						not to need an 8-byte value */
+};
+
+struct fts_psort_t {
+	ulint			psort_id;	/*!< Parallel sort ID */
+	row_merge_buf_t*	merge_buf[FTS_NUM_AUX_INDEX];
+						/*!< sort buffers */
+	merge_file_t*		merge_file[FTS_NUM_AUX_INDEX];
+						/*!< sort files */
+	row_merge_block_t*	merge_block[FTS_NUM_AUX_INDEX];
+						/*!< buffers to write to file */
+	row_merge_block_t*	crypt_block[FTS_NUM_AUX_INDEX];
+						/*!< buffers to crypt data */
+	ulint			child_status;	/*!< child task status */
+	ulint			state;		/*!< parent state */
+	fts_doc_list_t		fts_doc_list;	/*!< doc list to process */
+	fts_psort_common_t*	psort_common;	/*!< ptr to the common sort info */
+	tpool::waitable_task*	task;	/*!< threadpool task */
+	dberr_t			error;		/*!< db error during psort */
+	ulint			memory_used;	/*!< memory used by fts_doc_list */
+	mysql_mutex_t		mutex;		/*!< mutex for fts_doc_list */
+};
+
+/** Row FTS token for a plugin parser; an element of fts_token_list_t */
+struct row_fts_token_t {
+	fts_string_t*	text;		/*!< token */
+	UT_LIST_NODE_T(row_fts_token_t)
+			token_list;	/*!< token list node */
+};
+
+typedef UT_LIST_BASE_NODE_T(row_fts_token_t)     fts_token_list_t;
+
+/** Structure that stores the state of a string tokenization operation */
+struct fts_tokenize_ctx {
+	/** the processed string length in bytes
+	(when using the built-in tokenizer),
+	or the number of row_merge_fts_doc_tokenize_by_parser() calls */
+	ulint			processed_len;
+	ulint			init_pos;       /*!< doc start position */
+	ulint			buf_used;       /*!< the sort buffer (ID) when
+						tokenization stops, which could
+						be due to a full sort buffer */
+	ulint			rows_added[FTS_NUM_AUX_INDEX];
+						/*!< number of rows added for
+						each FTS index partition */
+	ib_rbt_t*		cached_stopword;/*!< in: stopword list */
+	dfield_t		sort_field[FTS_NUM_FIELDS_SORT];
+						/*!< in: sort field */
+	/** parsed tokens (when using an external parser) */
+	fts_token_list_t	fts_token_list;
+
+	fts_tokenize_ctx() :
+		processed_len(0), init_pos(0), buf_used(0),
+		rows_added(), cached_stopword(NULL), sort_field(),
+		fts_token_list()
+	{
+		memset(rows_added, 0, sizeof rows_added);
+		memset(sort_field, 0, sizeof sort_field);
+		UT_LIST_INIT(fts_token_list, &row_fts_token_t::token_list);
+	}
+};
+
+typedef struct fts_tokenize_ctx fts_tokenize_ctx_t;
+
+/** Structure that stores information needed for the insertion phase of
+FTS parallel sort. */
+struct fts_psort_insert {
+	CHARSET_INFO*	charset;	/*!< charset info */
+	mem_heap_t*	heap;		/*!< heap */
+	ibool		opt_doc_id_size;/*!< Whether to use a smaller (4-byte)
+					integer for Doc ID */
+	BtrBulk*	btr_bulk;	/*!< Bulk load instance */
+	dtuple_t*	tuple;		/*!< Tuple to insert */
+
+#ifdef UNIV_DEBUG
+	ulint		aux_index_id;	/*!< Auxiliary index id (debug only) */
+#endif
+};
+
+typedef struct fts_psort_insert	fts_psort_insert_t;
+
+
+/** Status bits used for communication between parent and child threads */
+#define FTS_PARENT_COMPLETE	1
+#define FTS_PARENT_EXITING	2
+#define FTS_CHILD_COMPLETE	1
+
+/** Print some debug information (str is the fprintf() format string) */
+#define	FTSORT_PRINT
+
+#ifdef	FTSORT_PRINT
+#define	DEBUG_FTS_SORT_PRINT(str)		\
+	do {					\
+		ut_print_timestamp(stderr);	\
+		fprintf(stderr, str);		\
+	} while (0)
+#else
+#define DEBUG_FTS_SORT_PRINT(str)
+#endif	/* FTSORT_PRINT */
+
+/*************************************************************//**
+Create a temporary "fts sort index" used to merge sort the
+tokenized doc string. The index has three "fields":
+
+1) Tokenized word,
+2) Doc ID
+3) Word's position in original 'doc'.
+
+@return dict_index_t structure for the fts sort index */
+dict_index_t*
+row_merge_create_fts_sort_index(
+/*============================*/
+	dict_index_t*	index,	/*!< in: Original FTS index
+				based on which this sort index
+				is created */
+	dict_table_t*	table,	/*!< in,out: table that FTS index
+				is being created on */
+	ibool*		opt_doc_id_size);
+				/*!< out: whether to use a 4-byte
+				instead of an 8-byte integer to
+				store Doc ID during sort */
+
+/** Initialize FTS parallel sort structures.
+@param[in]	trx		transaction
+@param[in,out]	dup		descriptor of FTS index being created
+@param[in]	new_table	table where indexes are created
+@param[in]	opt_doc_id_size	whether to use a 4-byte instead of an 8-byte
+				integer to store Doc ID during sort
+@param[in]	old_zip_size	page size of the old table during alter
+@param[out]	psort		parallel sort info to be instantiated
+@param[out]	merge		parallel merge info to be instantiated
+@return whether everything was successful */
+bool
+row_fts_psort_info_init(
+	trx_t*		trx,
+	row_merge_dup_t*dup,
+	dict_table_t*	new_table,
+	bool		opt_doc_id_size,
+	ulint		old_zip_size,
+	fts_psort_t**	psort,
+	fts_psort_t**	merge)
+	MY_ATTRIBUTE((nonnull));
+
+/********************************************************************//**
+Clean up and deallocate FTS parallel sort structures, and close
+temporary merge sort files */
+void
+row_fts_psort_info_destroy(
+/*=======================*/
+	fts_psort_t*	psort_info,	/*!< parallel sort info */
+	fts_psort_t*	merge_info);	/*!< parallel merge info */
+/********************************************************************//**
+Free the merge buffers when the merge sort is done */
+void
+row_fts_free_pll_merge_buf(
+/*=======================*/
+	fts_psort_t*	psort_info);	/*!< in: parallel sort info */
+
+/*********************************************************************//**
+Start the parallel tokenization and the parallel merge sort */
+void
+row_fts_start_psort(
+/*================*/
+	fts_psort_t*	psort_info);	/*!< in: parallel sort info */
+/*********************************************************************//**
+Kick off the parallel merge and insert thread */
+void
+row_fts_start_parallel_merge(
+/*=========================*/
+	fts_psort_t*	merge_info);	/*!< in: parallel merge info */
+/********************************************************************//**
+Propagate a newly added record up one level in the selection tree
+@return parent where this value propagated to */
+int
+row_merge_fts_sel_propagate(
+/*========================*/
+	int		propogated,	/*!< in: tree node propagated */
+	int*		sel_tree,	/*!< in: selection tree */
+	ulint		level,		/*!< in: selection tree level */
+	const mrec_t**	 mrec,		/*!< in: sort record */
+	rec_offs**	offsets,	/*!< in: record offsets */
+	dict_index_t*	index);		/*!< in: FTS index */
+/********************************************************************//**
+Read the sorted file containing index data tuples and insert these data
+tuples into the index
+@return DB_SUCCESS or error number */
+dberr_t
+row_fts_merge_insert(
+/*=================*/
+	dict_index_t*	index,		/*!< in: index */
+	dict_table_t*	table,		/*!< in: new table */
+	fts_psort_t*	psort_info,	/*!< parallel sort info */
+	ulint		id)		/*!< in: which auxiliary table's data
+					to insert to */
+	MY_ATTRIBUTE((nonnull));
+#endif /* row0ftsort_h */
diff --git a/storage/innobase/include/row0import.h b/storage/innobase/include/row0import.h
new file mode 100644
index 00000000..fd2651da
--- /dev/null
+++ b/storage/innobase/include/row0import.h
@@ -0,0 +1,67 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0import.h
+Header file for import tablespace functions.
+
+Created 2012-02-08 by Sunny Bains
+*******************************************************/
+
+#ifndef row0import_h
+#define row0import_h
+
+#include "dict0types.h"
+
+// Forward declarations
+struct trx_t;
+struct dict_table_t;
+struct row_prebuilt_t;
+
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary.
+@return DB_SUCCESS or error code; the result must not be ignored */
+dberr_t
+row_import_for_mysql(
+/*=================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct
+						in MySQL */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Update the DICT_TF2_DISCARDED flag in SYS_TABLES.MIX_LEN.
+@param[in,out]	trx		dictionary transaction
+@param[in]	table_id	table identifier
+@param[in]	discarded	true to set the flag, false to clear it
+@return DB_SUCCESS or error code */
+dberr_t row_import_update_discarded_flag(trx_t* trx, table_id_t table_id,
+					 bool discarded)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Update the root page numbers and the tablespace ID of a table.
+@param[in,out]	trx	dictionary transaction
+@param[in,out]	table	persistent table
+@param[in]	reset	whether to reset the fields to FIL_NULL
+@return DB_SUCCESS or error code */
+dberr_t
+row_import_update_index_root(trx_t* trx, dict_table_t* table, bool reset)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#endif /* row0import_h */
diff --git a/storage/innobase/include/row0ins.h b/storage/innobase/include/row0ins.h
new file mode 100644
index 00000000..ac2479c4
--- /dev/null
+++ b/storage/innobase/include/row0ins.h
@@ -0,0 +1,224 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ins.h
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0ins_h
+#define row0ins_h
+
+#include "data0data.h"
+#include "que0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include <vector>
+
+/***************************************************************//**
+Checks if foreign key constraint fails for an index entry. Sets shared locks
+which lock either the success or the failure of the constraint. NOTE that
+the caller must have a shared latch on dict_foreign_key_check_lock.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_NO_REFERENCED_ROW, or
+DB_ROW_IS_REFERENCED */
+dberr_t
+row_ins_check_foreign_constraint(
+/*=============================*/
+	ibool		check_ref,/*!< in: TRUE if we want to check that
+				the referenced table is ok, FALSE if we
+				want to check the foreign key table */
+	dict_foreign_t*	foreign,/*!< in: foreign constraint; NOTE that the
+				tables mentioned in it must be in the
+				dictionary cache if they exist at all */
+	dict_table_t*	table,	/*!< in: if check_ref is TRUE, then the foreign
+				table, else the referenced table */
+	dtuple_t*	entry,	/*!< in: index entry for index */
+	que_thr_t*	thr)	/*!< in: query thread */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Sets a new row to insert for an INS_DIRECT node. This function is only used
+if the row has been constructed separately, which is a rare case; this
+function is quite slow. */
+void
+ins_node_set_new_row(
+/*=================*/
+	ins_node_t*	node,	/*!< in: insert node */
+	dtuple_t*	row);	/*!< in: new row (or first row) for the node */
+/***************************************************************//**
+Tries to insert an entry into a clustered index, ignoring foreign key
+constraints. If a record with the same unique key is found, the other
+record is necessarily marked deleted by a committed transaction, or a
+unique key violation error occurs. The delete-marked record is then
+updated to an existing record, and we must write an undo log record on
+the delete-marked record.
+@retval DB_SUCCESS on success
+@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
+@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed
+@return error code */
+dberr_t
+row_ins_clust_index_entry_low(
+/*==========================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	btr_latch_mode	mode,	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+				depending on whether we wish optimistic or
+				pessimistic descent down the index tree */
+	dict_index_t*	index,	/*!< in: clustered index */
+	ulint		n_uniq,	/*!< in: 0 or index->n_uniq */
+	dtuple_t*	entry,	/*!< in/out: index entry to insert */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	que_thr_t*	thr)	/*!< in: query thread or NULL */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/***************************************************************//**
+Tries to insert an entry into a secondary index. If a record with exactly the
+same fields is found, the other record is necessarily marked deleted.
+It is then unmarked. Otherwise, the entry is just inserted into the index.
+@retval DB_SUCCESS on success
+@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
+@retval DB_FAIL if retry with BTR_INSERT_TREE is needed
+@return error code */
+dberr_t
+row_ins_sec_index_entry_low(
+/*========================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	btr_latch_mode	mode,	/*!< in: BTR_MODIFY_LEAF or BTR_INSERT_TREE,
+				depending on whether we wish optimistic or
+				pessimistic descent down the index tree */
+	dict_index_t*	index,	/*!< in: secondary index */
+	mem_heap_t*	offsets_heap,
+				/*!< in/out: memory heap that can be emptied */
+	mem_heap_t*	heap,	/*!< in/out: memory heap */
+	dtuple_t*	entry,	/*!< in/out: index entry to insert */
+	trx_id_t	trx_id,	/*!< in: PAGE_MAX_TRX_ID during
+				row_log_table_apply(), or 0 */
+	que_thr_t*	thr)	/*!< in: query thread */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/***************************************************************//**
+Inserts an entry into a clustered index. Tries first optimistic,
+then pessimistic descent down the tree. If the entry matches enough
+of a delete-marked record, performs the insert by updating or
+delete-unmarking the delete-marked record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+dberr_t
+row_ins_clust_index_entry(
+/*======================*/
+	dict_index_t*	index,	/*!< in: clustered index */
+	dtuple_t*	entry,	/*!< in/out: index entry to insert */
+	que_thr_t*	thr,	/*!< in: query thread */
+	ulint		n_ext)	/*!< in: number of externally stored columns */
+	MY_ATTRIBUTE((warn_unused_result));
+/***************************************************************//**
+Inserts an entry into a secondary index. Tries first optimistic,
+then pessimistic descent down the tree. If the entry matches enough
+of a delete-marked record, performs the insert by updating or
+delete-unmarking the delete-marked record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+dberr_t
+row_ins_sec_index_entry(
+/*====================*/
+	dict_index_t*	index,	/*!< in: secondary index */
+	dtuple_t*	entry,	/*!< in/out: index entry to insert */
+	que_thr_t*	thr,	/*!< in: query thread */
+	bool		check_foreign = true) /*!< in: true if a foreign key
+				check is needed (default), false otherwise */
+	MY_ATTRIBUTE((warn_unused_result));
+/***********************************************************//**
+Inserts a row into a table. This is a high-level function used in
+SQL execution graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_ins_step(
+/*=========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+
+/* Insert node types (values of ins_node_t::ins_type) */
+#define INS_SEARCHED	0	/* INSERT INTO ... SELECT ... */
+#define INS_VALUES	1	/* INSERT INTO ... VALUES ... */
+#define INS_DIRECT	2	/* this is for internal use in dict0crea:
+				insert the row directly */
+
+/* Node execution states (values of ins_node_t::state) */
+#define	INS_NODE_SET_IX_LOCK	1	/* we should set an IX lock on table */
+#define INS_NODE_ALLOC_ROW_ID	2	/* row id should be allocated */
+#define	INS_NODE_INSERT_ENTRIES 3	/* index entries should be built and
+					inserted */
+
+struct row_prebuilt_t;
+
+/** Insert node structure; owns entry_sys_heap (see ctor and dtor) */
+struct ins_node_t
+{
+	explicit ins_node_t(ulint ins_type, dict_table_t *table) :
+		common(QUE_NODE_INSERT, NULL),
+		ins_type(ins_type),
+		row(NULL), table(table), select(NULL), values_list(NULL),
+		state(INS_NODE_SET_IX_LOCK), index(NULL),
+		entry_list(), entry(entry_list.end()),
+		trx_id(0), entry_sys_heap(mem_heap_create(128))
+	{
+	}
+	~ins_node_t() { mem_heap_free(entry_sys_heap); }
+	que_common_t common;	 /*!< node type: QUE_NODE_INSERT */
+	ulint		ins_type;/* INS_VALUES, INS_SEARCHED, or INS_DIRECT */
+	dtuple_t*	row;	/*!< row to insert */
+	dict_table_t*	table;	/*!< table where to insert */
+	sel_node_t*	select;	/*!< select in searched insert */
+	que_node_t*	values_list;/* list of expressions to evaluate and
+				insert in an INS_VALUES insert */
+	ulint		state;	/*!< node execution state */
+	dict_index_t*	index;	/*!< NULL, or the next index where the index
+				entry should be inserted */
+	std::vector<dtuple_t*>
+			entry_list;/* list of entries, one for each index */
+	std::vector<dtuple_t*>::iterator
+			entry;	/*!< iterator into entry_list: the entry to
+				insert in the index; the constructor
+				initializes it to entry_list.end() */
+	/** buffer for the system columns */
+	byte		sys_buf[DATA_ROW_ID_LEN
+				+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
+	trx_id_t	trx_id;	/*!< trx id or the last trx which executed the
+				node */
+	byte		vers_start_buf[8]; /* Buffers for System Versioning */
+	byte		vers_end_buf[8];   /* system fields. */
+	mem_heap_t*	entry_sys_heap;
+				/* memory heap used as auxiliary storage;
+				created by the constructor and freed by the
+				destructor; NOTE(review): presumably holds
+				the entries and sys field data - confirm */
+        void vers_update_end(row_prebuilt_t *prebuilt, bool history_row);
+};
+
+/** Construct an insert node in memory allocated from a memory heap.
+@param ins_type     INS_VALUES, ...
+@param table        table where to insert
+@param heap         memory heap that provides the storage for the node
+@return the created object */
+inline ins_node_t *ins_node_create(ulint ins_type, dict_table_t *table,
+                                   mem_heap_t *heap)
+{
+  void *storage= mem_heap_alloc(heap, sizeof(ins_node_t));
+  return new (storage) ins_node_t(ins_type, table);
+}
+
+#endif
diff --git a/storage/innobase/include/row0log.h b/storage/innobase/include/row0log.h
new file mode 100644
index 00000000..469f1f8a
--- /dev/null
+++ b/storage/innobase/include/row0log.h
@@ -0,0 +1,239 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0log.h
+Modification log for online index creation and online table rebuild
+
+Created 2011-05-26 Marko Makela
+*******************************************************/
+
+#pragma once
+
+#include "que0types.h"
+#include "mtr0types.h"
+#include "row0types.h"
+#include "rem0types.h"
+#include "dict0dict.h"
+#include "trx0types.h"
+#include "trx0undo.h"
+
+class ut_stage_alter_t;
+
+extern Atomic_counter<ulint> onlineddl_rowlog_rows;
+extern ulint onlineddl_rowlog_pct_used;
+extern ulint onlineddl_pct_progress;
+
+/******************************************************//**
+Allocate the row log for an index and flag the index
+for online creation.
+@retval true if success, false if not */
+bool
+row_log_allocate(
+/*=============*/
+	const trx_t*	trx,	/*!< in: the ALTER TABLE transaction */
+	dict_index_t*	index,	/*!< in/out: index */
+	dict_table_t*	table,	/*!< in/out: new table being rebuilt,
+				or NULL when creating a secondary index */
+	bool		same_pk,/*!< in: whether the definition of the
+				PRIMARY KEY has remained the same */
+	const dtuple_t*	defaults,
+				/*!< in: default values of
+				added, changed columns, or NULL */
+	const ulint*	col_map,/*!< in: mapping of old column
+				numbers to new ones, or NULL if !table */
+	const char*	path,	/*!< in: where to create temporary file */
+	const TABLE*	old_table,	/*!< in:table definition before alter */
+	bool		allow_not_null) /*!< in: allow null to non-null
+					conversion */
+	MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/******************************************************//**
+Free the row log for an index that was being created online. */
+void
+row_log_free(
+/*=========*/
+	row_log_t*	log)	/*!< in,own: row log */
+	MY_ATTRIBUTE((nonnull));
+
+/** Flag the online creation of a secondary index as aborted and free its
+row log. Debug builds assert that index->lock is held in U or X mode. */
+inline void row_log_abort_sec(dict_index_t *index)
+{
+  ut_ad(index->lock.have_u_or_x());
+  ut_ad(!index->is_clust()); /* only for indexes that are not clustered */
+  dict_index_set_online_status(index, ONLINE_INDEX_ABORTED);
+  row_log_free(index->online_log);
+  index->online_log= nullptr; /* the log was handed over to row_log_free() */
+}
+
+/** Logs an operation to a secondary index that is (or was) being created.
+@param	index	index, S or X latched
+@param	tuple	index tuple
+@param	trx_id	transaction ID for insert, or 0 for delete
+@retval false if row_log_apply() failure happens
+or true otherwise */
+bool row_log_online_op(dict_index_t *index, const dtuple_t *tuple,
+                       trx_id_t trx_id) ATTRIBUTE_COLD;
+
+/******************************************************//**
+Gets the error status of the online index rebuild log.
+@return DB_SUCCESS or error code */
+dberr_t
+row_log_table_get_error(
+/*====================*/
+	const dict_index_t*	index)	/*!< in: clustered index of a table
+					that is being rebuilt online */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Check whether a virtual column is indexed in the new table being
+created during alter table
+@param[in]	index	cluster index
+@param[in]	v_no	virtual column number
+@return true if it is indexed, else false */
+bool
+row_log_col_is_indexed(
+	const dict_index_t*	index,
+	ulint			v_no);
+
+/******************************************************//**
+Logs a delete operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_delete(). */
+void
+row_log_table_delete(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec,index) */
+	const byte*	sys)	/*!< in: DB_TRX_ID,DB_ROLL_PTR that should
+				be logged, or NULL to use those in rec */
+	ATTRIBUTE_COLD __attribute__((nonnull(1,2,3)));
+
+/******************************************************//**
+Logs an update operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_update(). */
+void
+row_log_table_update(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec,index) */
+	const dtuple_t*	old_pk);/*!< in: row_log_table_get_pk()
+				before the update */
+
+/******************************************************//**
+Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR
+of a table that is being rebuilt.
+@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table,
+or NULL if the PRIMARY KEY definition does not change */
+const dtuple_t*
+row_log_table_get_pk(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec,index),
+				or NULL */
+	byte*		sys,	/*!< out: DB_TRX_ID,DB_ROLL_PTR for
+				row_log_table_delete(), or NULL */
+	mem_heap_t**	heap)	/*!< in/out: memory heap where allocated */
+	ATTRIBUTE_COLD __attribute__((nonnull(1,2,5), warn_unused_result));
+
+/******************************************************//**
+Logs an insert to a table that is being rebuilt.
+This will be merged in row_log_table_apply_insert(). */
+void
+row_log_table_insert(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const rec_offs*	offsets);/*!< in: rec_get_offsets(rec,index) */
+
+/** Apply the row_log_table log to a table upon completing rebuild.
+@param[in]	thr		query graph
+@param[in]	old_table	old table
+@param[in,out]	table		MySQL table (for reporting duplicates)
+@param[in,out]	stage		performance schema accounting object, used by
+ALTER TABLE. stage->begin_phase_log_table() will be called initially and then
+stage->inc() will be called for each block of log that is applied.
+@param[in]	new_table	Altered table
+@return DB_SUCCESS, or error code on failure */
+dberr_t
+row_log_table_apply(
+	que_thr_t*		thr,
+	dict_table_t*		old_table,
+	struct TABLE*		table,
+	ut_stage_alter_t*	stage,
+	dict_table_t*		new_table)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+Get the latest transaction ID that has invoked row_log_online_op()
+during online creation.
+@return latest transaction ID, or 0 if nothing was logged */
+trx_id_t
+row_log_get_max_trx(
+/*================*/
+	dict_index_t*	index)	/*!< in: index, must be locked */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Apply the row log to the index upon completing index creation.
+@param[in]	trx	transaction (for checking if the operation was
+interrupted)
+@param[in,out]	index	secondary index
+@param[in,out]	table	MySQL table (for reporting duplicates)
+@param[in,out]	stage	performance schema accounting object, used by
+ALTER TABLE. stage->begin_phase_log_index() will be called initially and then
+stage->inc() will be called for each block of log that is applied.
+@return DB_SUCCESS, or error code on failure */
+dberr_t
+row_log_apply(
+	const trx_t*		trx,
+	dict_index_t*		index,
+	struct TABLE*		table,
+	ut_stage_alter_t*	stage)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Get the n_core_fields of online log for the index
+@param	 index	index whose n_core_fields of log to be accessed
+@return number of n_core_fields */
+unsigned row_log_get_n_core_fields(const dict_index_t *index);
+
+/** Get the error code of online log for the index
+@param	index	online index
+@return error code present in online log */
+dberr_t row_log_get_error(const dict_index_t *index);
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+/** Estimate how much work is to be done by the log apply phase
+of an ALTER TABLE for this index.
+@param[in]	index	index whose log to assess
+@return work to be done by log-apply in abstract units
+*/
+ulint
+row_log_estimate_work(
+	const dict_index_t*	index);
+#endif /* HAVE_PSI_STAGE_INTERFACE */
diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h
new file mode 100644
index 00000000..93ea650d
--- /dev/null
+++ b/storage/innobase/include/row0merge.h
@@ -0,0 +1,496 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0merge.h
+Index build routines using a merge sort
+
+Created 13/06/2005 Jan Lindstrom
+*******************************************************/
+
+#pragma once
+
+#include "que0types.h"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "rem0rec.h"
+#include "btr0types.h"
+#include "row0mysql.h"
+#include "lock0types.h"
+#include "srv0srv.h"
+
+class ut_stage_alter_t;
+
+/* Reserve free space from every block for key_version */
+#define ROW_MERGE_RESERVE_SIZE 4
+
+/* Cluster index read task is mandatory */
+#define COST_READ_CLUSTERED_INDEX            1.0
+
+/* Basic fixed cost to build all type of index */
+#define COST_BUILD_INDEX_STATIC              0.5
+/* Dynamic cost to build all type of index, dynamic cost will be re-distributed based on page count ratio of each index */
+#define COST_BUILD_INDEX_DYNAMIC             0.5
+
+/* Sum of below two must be 1.0 */
+#define PCT_COST_MERGESORT_INDEX                 0.4
+#define PCT_COST_INSERT_INDEX                    0.6
+
+// Forward declaration
+struct ib_sequence_t;
+
+/** @brief Block size for I/O operations in merge sort.
+
+The minimum is srv_page_size, or page_get_free_space_of_empty()
+rounded to a power of 2.
+
+When not creating a PRIMARY KEY that contains column prefixes, this
+can be set as small as srv_page_size / 2. */
+typedef byte	row_merge_block_t;
+
+/** @brief Secondary buffer for I/O operations of merge records.
+
+This buffer is used for writing or reading a record that spans two
+row_merge_block_t.  Thus, it must be able to hold one merge record,
+whose maximum size is the same as the minimum size of
+row_merge_block_t. */
+typedef byte	mrec_buf_t[UNIV_PAGE_SIZE_MAX];
+
+/** @brief Merge record in row_merge_block_t.
+
+The format is the same as a record in ROW_FORMAT=COMPACT with the
+exception that the REC_N_NEW_EXTRA_BYTES are omitted. */
+typedef byte	mrec_t;
+
+/** A merge record as it is held in a row_merge_buf_t. */
+struct mtuple_t {
+  dfield_t *fields; /*!< data fields of the record */
+};
+
+/** Main-memory buffer in which tuples are sorted. */
+struct row_merge_buf_t {
+  mem_heap_t *heap;      /*!< memory heap where the buffer is allocated */
+  dict_index_t *index;   /*!< index that the tuples belong to */
+  ulint total_size;      /*!< total amount of data bytes */
+  ulint n_tuples;        /*!< number of data tuples */
+  ulint max_tuples;      /*!< maximum number of data tuples */
+  mtuple_t *tuples;      /*!< array of data tuples */
+  mtuple_t *tmp_tuples;  /*!< temporary copy of tuples,
+                         used for sorting */
+};
+
+/** Bookkeeping for a temporary file that is used in merge sort. */
+struct merge_file_t {
+  pfs_os_file_t fd;   /*!< file descriptor of the temporary file */
+  ulint offset;       /*!< file offset (end of file) */
+  ib_uint64_t n_rec;  /*!< number of records in the file */
+};
+
+/** Definition of one field of an index. */
+struct index_field_t {
+  ulint col_no;      /*!< column offset */
+  ulint prefix_len;  /*!< column prefix length, or 0
+                     when the whole column is indexed */
+  bool is_v_col;     /*!< whether this is a virtual column */
+  bool descending;   /*!< whether to use DESC order */
+};
+
+/** Describes an index that is being created. */
+struct index_def_t {
+  const char *name;       /*!< name of the index */
+  bool rebuild;           /*!< whether the table is rebuilt */
+  ulint ind_type;         /*!< index type: 0, DICT_UNIQUE,
+                          or DICT_CLUSTERED */
+  ulint key_number;       /*!< MySQL key number,
+                          or ULINT_UNDEFINED if there is none */
+  ulint n_fields;         /*!< number of fields in the index */
+  index_field_t *fields;  /*!< definitions of the fields */
+  st_mysql_ftparser
+      *parser;            /*!< fulltext parser plugin */
+};
+
+/** State that is used for reporting duplicate records. */
+struct row_merge_dup_t {
+  dict_index_t *index;   /*!< index being sorted */
+  struct TABLE *table;   /*!< MySQL table object */
+  const ulint *col_map;  /*!< mapping of column numbers
+                         in table to the rebuilt table
+                         (index->table), or NULL if the
+                         table is not being rebuilt */
+  ulint n_dup;           /*!< number of duplicates */
+};
+
+/*************************************************************//**
+Report a duplicate key. */
+void
+row_merge_dup_report(
+/*=================*/
+	row_merge_dup_t*	dup,	/*!< in/out: for reporting duplicates */
+	const dfield_t*		entry)	/*!< in: duplicate index entry */
+	MY_ATTRIBUTE((nonnull));
+
+/** Drop indexes that were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed.
+@param trx              dictionary transaction
+@param table            table containing the indexes
+@param locked           True if table is locked,
+                        false - may need to do lazy drop
+@param alter_trx        Alter table transaction */
+void
+row_merge_drop_indexes(
+        trx_t*          trx,
+        dict_table_t*   table,
+        bool            locked,
+        const trx_t*    alter_trx=NULL);
+
+/** During recovery, drop recovered index stubs that were created in
+prepare_inplace_alter_table_dict(). */
+void row_merge_drop_temp_indexes();
+
+/** Create temporary merge files in the given path, and if
+UNIV_PFS_IO is defined, register the file descriptor with Performance Schema.
+@param[in]	path	location for creating temporary merge files, or NULL
+@return File descriptor */
+pfs_os_file_t
+row_merge_file_create_low(
+	const char*	path)
+	MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Destroy a merge file, and de-register the file from Performance Schema
+if UNIV_PFS_IO is defined. */
+void
+row_merge_file_destroy_low(
+/*=======================*/
+	const pfs_os_file_t&	fd);	/*!< in: merge file descriptor */
+
+/*********************************************************************//**
+Rename an index in the dictionary that was created. The data
+dictionary must have been locked exclusively by the caller, because
+the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+dberr_t
+row_merge_rename_index_to_add(
+/*==========================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	table_id_t	table_id,	/*!< in: table identifier */
+	index_id_t	index_id)	/*!< in: index identifier */
+	MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/** Create the index and load in to the dictionary.
+@param[in,out]	table		the index is on this table
+@param[in]	index_def	the index definition
+@param[in]	add_v		new virtual columns added along with add
+				index call
+@return index, or NULL on error */
+dict_index_t*
+row_merge_create_index(
+	dict_table_t*		table,
+	const index_def_t*	index_def,
+	const dict_add_v_col_t*	add_v)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Check if a transaction can use an index.
+@return whether the index can be used by the transaction */
+bool
+row_merge_is_index_usable(
+/*======================*/
+	const trx_t*		trx,	/*!< in: transaction */
+	const dict_index_t*	index)	/*!< in: index to check */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Map from column numbers to column definitions that include
+changes to the collation, when the encoding is compatible with
+the original column and no table rebuild is needed */
+typedef std::map<unsigned, dict_col_t*> col_collations;
+
+/** Build indexes on a table by reading a clustered index, creating a temporary
+file containing index entries, merge sorting these index entries and inserting
+sorted index entries to indexes.
+@param[in]	trx		transaction
+@param[in]	old_table	table where rows are read from
+@param[in]	new_table	table where indexes are created; identical to
+old_table unless creating a PRIMARY KEY
+@param[in]	online		true if creating indexes online
+@param[in]	indexes		indexes to be created
+@param[in]	key_numbers	MySQL key numbers
+@param[in]	n_indexes	size of indexes[]
+@param[in,out]	table		MySQL table, for reporting erroneous key value
+if applicable
+@param[in]	defaults	default values of added, changed columns, or NULL
+@param[in]	col_map		mapping of old column numbers to new ones, or
+NULL if old_table == new_table
+@param[in]	add_autoinc	number of added AUTO_INCREMENT columns, or
+ULINT_UNDEFINED if none is added
+@param[in,out]	sequence	autoinc sequence
+@param[in]	skip_pk_sort	whether the new PRIMARY KEY will follow
+existing order
+@param[in,out]	stage		performance schema accounting object, used by
+ALTER TABLE. stage->begin_phase_read_pk() will be called at the beginning of
+this function and it will be passed to other functions for further accounting.
+@param[in]	add_v		new virtual columns added along with indexes
+@param[in]	eval_table	mysql table used to evaluate virtual column
+				value, see innobase_get_computed_value().
+@param[in]	allow_non_null	allow the conversion from null to not-null
+@param[in]	col_collate	columns whose collations changed, or nullptr
+@return DB_SUCCESS or error code */
+dberr_t
+row_merge_build_indexes(
+	trx_t*			trx,
+	dict_table_t*		old_table,
+	dict_table_t*		new_table,
+	bool			online,
+	dict_index_t**		indexes,
+	const ulint*		key_numbers,
+	ulint			n_indexes,
+	struct TABLE*		table,
+	const dtuple_t*		defaults,
+	const ulint*		col_map,
+	ulint			add_autoinc,
+	ib_sequence_t&		sequence,
+	bool			skip_pk_sort,
+	ut_stage_alter_t*	stage,
+	const dict_add_v_col_t*	add_v,
+	struct TABLE*		eval_table,
+	bool			allow_non_null,
+	const col_collations*	col_collate)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Write a buffer to a block.
+@param buf              sorted buffer
+@param block            buffer for writing to file
+@param blob_file        blob file handle for doing bulk insert operation */
+dberr_t row_merge_buf_write(const row_merge_buf_t *buf,
+#ifndef DBUG_OFF
+                            const merge_file_t *of, /*!< output file */
+#endif
+                            row_merge_block_t *block,
+                            merge_file_t *blob_file= nullptr);
+
+/********************************************************************//**
+Sort a buffer. */
+void
+row_merge_buf_sort(
+/*===============*/
+	row_merge_buf_t*	buf,	/*!< in/out: sort buffer */
+	row_merge_dup_t*	dup)	/*!< in/out: reporter of duplicates
+					(NULL if non-unique index) */
+	MY_ATTRIBUTE((nonnull(1)));
+
+/********************************************************************//**
+Write a merge block to the file system.
+@return whether the request was completed successfully
+@retval	false	on error
+@retval	true	on success */
+bool
+row_merge_write(
+	const pfs_os_file_t&	fd,	/*!< in: file descriptor */
+	ulint		offset,	/*!< in: offset where to write,
+				in number of row_merge_block_t elements */
+	const void*	buf,	/*!< in: data */
+	void*		crypt_buf,		/*!< in: crypt buf or NULL */
+	ulint		space)			/*!< in: space id */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/********************************************************************//**
+Empty a sort buffer.
+@return sort buffer */
+row_merge_buf_t*
+row_merge_buf_empty(
+/*================*/
+	row_merge_buf_t*	buf)	/*!< in,own: sort buffer */
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+/** Create a merge file in the given location.
+@param[out]	merge_file	merge file structure
+@param[in]	path		location for creating temporary file, or NULL
+@return file descriptor, or -1 on failure */
+pfs_os_file_t
+row_merge_file_create(
+	merge_file_t*	merge_file,
+	const char*	path)
+	MY_ATTRIBUTE((warn_unused_result, nonnull(1)));
+
+/** Merge disk files.
+@param[in]	trx	transaction
+@param[in]	dup	descriptor of index being created
+@param[in,out]	file	file containing index entries
+@param[in,out]	block	3 buffers
+@param[in,out]	tmpfd	temporary file handle
+@param[in]      update_progress true, if we should update progress status
+@param[in]      pct_progress total progress percent until now
+@param[in]      pct_cost current progress percent
+@param[in]      crypt_block crypt buf or NULL
+@param[in]      space    space_id
+@param[in,out]	stage	performance schema accounting object, used by
+ALTER TABLE. If not NULL, stage->begin_phase_sort() will be called initially
+and then stage->inc() will be called for each record processed.
+@return DB_SUCCESS or error code */
+dberr_t
+row_merge_sort(
+/*===========*/
+	trx_t*			trx,
+	const row_merge_dup_t*	dup,
+	merge_file_t*		file,
+	row_merge_block_t*	block,
+	pfs_os_file_t*		tmpfd,
+	const bool		update_progress,
+	const double	pct_progress,
+	const double	pct_cost,
+	row_merge_block_t*	crypt_block,
+	ulint			space,
+	ut_stage_alter_t*	stage = NULL)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Allocate a sort buffer.
+@return own: sort buffer */
+row_merge_buf_t*
+row_merge_buf_create(
+/*=================*/
+	dict_index_t*	index)	/*!< in: secondary index */
+	MY_ATTRIBUTE((warn_unused_result, nonnull, malloc));
+
+/*********************************************************************//**
+Deallocate a sort buffer. */
+void
+row_merge_buf_free(
+/*===============*/
+	row_merge_buf_t*	buf)	/*!< in,own: sort buffer to be freed */
+	MY_ATTRIBUTE((nonnull));
+
+/*********************************************************************//**
+Destroy a merge file. */
+void
+row_merge_file_destroy(
+/*===================*/
+	merge_file_t*	merge_file)	/*!< in/out: merge file structure */
+	MY_ATTRIBUTE((nonnull));
+
+/** Read a merge block from the file system.
+@return whether the request was completed successfully */
+bool
+row_merge_read(
+/*===========*/
+	const pfs_os_file_t&	fd,	/*!< in: file descriptor */
+	ulint			offset,	/*!< in: offset where to read
+					in number of row_merge_block_t
+					elements */
+	row_merge_block_t*	buf,	/*!< out: data */
+	row_merge_block_t*	crypt_buf, /*!< in: crypt buf or NULL */
+	ulint			space)	   /*!< in: space id */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/********************************************************************//**
+Read a merge record.
+@return pointer to next record, or NULL on I/O error or end of list */
+const byte*
+row_merge_read_rec(
+/*===============*/
+	row_merge_block_t*	block,	/*!< in/out: file buffer */
+	mrec_buf_t*		buf,	/*!< in/out: secondary buffer */
+	const byte*		b,	/*!< in: pointer to record */
+	const dict_index_t*	index,	/*!< in: index of the record */
+	const pfs_os_file_t&	fd,	/*!< in: file descriptor */
+	ulint*			foffs,	/*!< in/out: file offset */
+	const mrec_t**		mrec,	/*!< out: pointer to merge record,
+					or NULL on end of list
+					(non-NULL on I/O error) */
+	rec_offs*		offsets,/*!< out: offsets of mrec */
+	row_merge_block_t*	crypt_block, /*!< in: crypt buf or NULL */
+	ulint			space)	   /*!< in: space id */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Buffers and temporary files that are used for a bulk insert into a table */
+class row_merge_bulk_t
+{
+  /** Main-memory buffers for sorting, one for each index
+  in the table */
+  row_merge_buf_t *m_merge_buf;
+  /** Block for I/O operations */
+  row_merge_block_t *m_block= nullptr;
+  /** Files that store the buffers and are used for merge sort */
+  merge_file_t *m_merge_files= nullptr;
+  /** Temporary file to be used for merge sort */
+  pfs_os_file_t m_tmpfd;
+  /** Allocator of row_merge_block_t storage */
+  ut_allocator<row_merge_block_t> m_alloc;
+  /** Allocation descriptor for m_alloc (presumably that of m_block) */
+  ut_new_pfx_t m_block_pfx;
+  /** Temporary file for storing the blob */
+  merge_file_t m_blob_file;
+  /** Allocation descriptor of m_crypt_block */
+  ut_new_pfx_t m_crypt_pfx;
+  /** Block for encryption */
+  row_merge_block_t *m_crypt_block= nullptr;
+public:
+  /** Constructor.
+  Create all merge files and merge buffers for all the table indexes
+  except fts indexes.
+  Create a merge block which is used for write I/O operations.
+  @param table  table which undergoes the bulk insert operation */
+  row_merge_bulk_t(dict_table_t *table);
+
+  /** Destructor.
+  Remove all merge files and merge buffers of all table indexes. */
+  ~row_merge_bulk_t();
+
+  /** Remove all buffers of the table indexes */
+  void remove_all_bulk_buffer();
+
+  /** Clean the merge buffer of the given index number */
+  void clean_bulk_buffer(ulint index_no);
+
+  /** Create the temporary file for the given index number
+  @retval true if the temporary file was created successfully */
+  bool create_tmp_file(ulint index_no);
+
+  /** Write the merge buffer of the given index number
+  to its temporary file.
+  @param index_no       number of the index whose buffer is written */
+  dberr_t write_to_tmp_file(ulint index_no);
+
+  /** Add the tuple to the merge buffer for the given index.
+  If the buffer runs out of memory, write the buffer into
+  the temporary file and insert the tuple again.
+  @param row     tuple to be inserted
+  @param ind     index to be buffered
+  @param trx     bulk transaction */
+  dberr_t bulk_insert_buffered(const dtuple_t &row, const dict_index_t &ind,
+                               trx_t *trx);
+
+  /** Do the bulk insert operation into the index tree from
+  the buffer, or from the merge file if it exists
+  @param index_no  number of the index to be inserted into
+  @param trx       bulk transaction */
+  dberr_t write_to_index(ulint index_no, trx_t *trx);
+
+  /** Do the bulk insert of the buffered inserts for the table.
+  @param table  table which undergoes the bulk insert operation
+  @param trx    bulk transaction */
+  dberr_t write_to_table(dict_table_t *table, trx_t *trx);
+
+  /** Allocate the block for writing the buffer to disk */
+  dberr_t alloc_block();
+
+  /** Initialize the temporary files for each index */
+  void init_tmp_file();
+};
diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h
new file mode 100644
index 00000000..878d9c9f
--- /dev/null
+++ b/storage/innobase/include/row0mysql.h
@@ -0,0 +1,841 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0mysql.h
+Interface between Innobase row operations and MySQL.
+Contains also create table and other data dictionary operations.
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0mysql_h
+#define row0mysql_h
+
+#include "que0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include "btr0types.h"
+#include "lock0types.h"
+#include "fil0fil.h"
+#include "fts0fts.h"
+#include "gis0type.h"
+
+struct row_prebuilt_t;
+class ha_innobase;
+class ha_handler_stats;
+
+/*******************************************************************//**
+Frees the blob heap in prebuilt when no longer needed. */
+void
+row_mysql_prebuilt_free_blob_heap(
+/*==============================*/
+	row_prebuilt_t*	prebuilt);	/*!< in: prebuilt struct of a
+					ha_innobase:: table handle */
+/*******************************************************************//**
+Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row
+format.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+byte*
+row_mysql_store_true_var_len(
+/*=========================*/
+	byte*	dest,	/*!< in: where to store */
+	ulint	len,	/*!< in: length, must fit in two bytes */
+	ulint	lenlen);/*!< in: storage length of len: either 1 or 2 bytes */
+/*******************************************************************//**
+Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and
+returns a pointer to the data.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+const byte*
+row_mysql_read_true_varchar(
+/*========================*/
+	ulint*		len,	/*!< out: variable-length field length */
+	const byte*	field,	/*!< in: field in the MySQL format */
+	ulint		lenlen);/*!< in: storage length of len: either 1
+				or 2 bytes */
+/*******************************************************************//**
+Stores a reference to a BLOB in the MySQL format. */
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+	byte*		dest,	/*!< in: where to store */
+	ulint		col_len,/*!< in: dest buffer size: determines into
+				how many bytes the BLOB length is stored,
+				the space for the length may vary from 1
+				to 4 bytes */
+	const void*	data,	/*!< in: BLOB data; if the value to store
+				is SQL NULL this should be NULL pointer */
+	ulint		len);	/*!< in: BLOB length; if the value to store
+				is SQL NULL this should be 0; remember
+				also to set the NULL bit in the MySQL record
+				header! */
+/*******************************************************************//**
+Reads a reference to a BLOB in the MySQL format.
+@return pointer to BLOB data */
+const byte*
+row_mysql_read_blob_ref(
+/*====================*/
+	ulint*		len,		/*!< out: BLOB length */
+	const byte*	ref,		/*!< in: BLOB reference in the
+					MySQL format */
+	ulint		col_len);	/*!< in: BLOB reference length
+					(not BLOB length) */
+/*******************************************************************//**
+Converts InnoDB geometry data format to MySQL data format. */
+void
+row_mysql_store_geometry(
+/*=====================*/
+	byte*		dest,		/*!< in/out: where to store */
+	ulint		dest_len,	/*!< in: dest buffer size: determines into
+					how many bytes the geometry length is stored,
+					the space for the length may vary from 1
+					to 4 bytes */
+	const byte*	src,		/*!< in: geometry data; if the value to store
+					is SQL NULL this should be NULL pointer */
+	ulint		src_len);	/*!< in: geometry length; if the value to store
+					is SQL NULL this should be 0; remember
+					also to set the NULL bit in the MySQL record
+					header! */
+/**************************************************************//**
+Pad a column with spaces. */
+void
+row_mysql_pad_col(
+/*==============*/
+	ulint	mbminlen,	/*!< in: minimum size of a character,
+				in bytes */
+	byte*	pad,		/*!< out: padded buffer */
+	ulint	len);		/*!< in: number of bytes to pad */
+
+/**************************************************************//**
+Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
+The counterpart of this function is row_sel_field_store_in_mysql_format() in
+row0sel.cc.
+@return up to which byte we used buf in the conversion */
+byte*
+row_mysql_store_col_in_innobase_format(
+/*===================================*/
+	dfield_t*	dfield,		/*!< in/out: dfield where dtype
+					information must be already set when
+					this function is called! */
+	byte*		buf,		/*!< in/out: buffer for a converted
+					integer value; this must be at least
+					col_len long then! NOTE that dfield
+					may also get a pointer to 'buf',
+					therefore do not discard this as long
+					as dfield is used! */
+	ibool		row_format_col,	/*!< TRUE if the mysql_data is from
+					a MySQL row, FALSE if from a MySQL
+					key value;
+					in MySQL, a true VARCHAR storage
+					format differs in a row and in a
+					key value: in a key value the length
+					is always stored in 2 bytes! */
+	const byte*	mysql_data,	/*!< in: MySQL column value, not
+					SQL NULL; NOTE that dfield may also
+					get a pointer to mysql_data,
+					therefore do not discard this as long
+					as dfield is used! */
+	ulint		col_len,	/*!< in: MySQL column length; NOTE that
+					this is the storage length of the
+					column in the MySQL format row, not
+					necessarily the length of the actual
+					payload data; if the column is a true
+					VARCHAR then this is irrelevant */
+	ulint		comp);		/*!< in: nonzero=compact format */
+/****************************************************************//**
+Handles user errors and lock waits detected by the database engine.
+@return true if it was a lock wait and we should continue running the
+query thread */
+bool
+row_mysql_handle_errors(
+/*====================*/
+	dberr_t*	new_err,/*!< out: possible new error encountered in
+				rollback, or the old error which was
+				in effect at function entry */
+	trx_t*		trx,	/*!< in: transaction */
+	que_thr_t*	thr,	/*!< in: query thread, or NULL */
+	trx_savept_t*	savept)	/*!< in: savepoint, or NULL */
+	MY_ATTRIBUTE((nonnull(1,2)));
+/********************************************************************//**
+Create a prebuilt struct for a MySQL table handle.
+@return own: a prebuilt struct */
+row_prebuilt_t*
+row_create_prebuilt(
+/*================*/
+	dict_table_t*	table,		/*!< in: Innobase table handle */
+	ulint		mysql_row_len);	/*!< in: length in bytes of a row in
+					the MySQL format */
+/** Free a prebuilt struct for a TABLE handle. */
+void row_prebuilt_free(row_prebuilt_t *prebuilt);
+/*********************************************************************//**
+Updates the transaction pointers in query graphs stored in the prebuilt
+struct. */
+void
+row_update_prebuilt_trx(
+/*====================*/
+	row_prebuilt_t*	prebuilt,	/*!< in/out: prebuilt struct
+					in MySQL handle */
+	trx_t*		trx);		/*!< in: transaction handle */
+
+/*********************************************************************//**
+Sets an AUTO_INC type lock on the table mentioned in prebuilt. The
+AUTO_INC lock gives exclusive access to the auto-inc counter of the
+table. The lock is reserved only for the duration of an SQL statement.
+It is not compatible with another AUTO_INC or exclusive lock on the
+table.
+@return error code or DB_SUCCESS */
+dberr_t
+row_lock_table_autoinc_for_mysql(
+/*=============================*/
+	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct in the MySQL
+					table handle */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Lock a table.
+@param[in,out]	prebuilt	table handle
+@return error code or DB_SUCCESS */
+dberr_t
+row_lock_table(row_prebuilt_t* prebuilt);
+
+/** System Versioning: row_insert_for_mysql() modes */
+enum ins_mode_t {
+	/** plain row (without versioning) */
+	ROW_INS_NORMAL = 0,
+	/** versioned row: row_start = TRX_ID, row_end = MAX */
+	ROW_INS_VERSIONED = 1,
+	/** historical row: row_end = TRX_ID */
+	ROW_INS_HISTORICAL = 2
+};
+
+/** Does an insert for MySQL.
+@param[in]	mysql_rec	row in the MySQL format
+@param[in,out]	prebuilt	prebuilt struct in MySQL handle
+@param[in]	ins_mode	what row type we're inserting
+@return error code or DB_SUCCESS*/
+dberr_t
+row_insert_for_mysql(
+	const byte*		mysql_rec,
+	row_prebuilt_t*		prebuilt,
+	ins_mode_t		ins_mode)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Builds a dummy query graph used in selects. */
+void
+row_prebuild_sel_graph(
+/*===================*/
+	row_prebuilt_t*	prebuilt);	/*!< in: prebuilt struct in MySQL
+					handle */
+/*********************************************************************//**
+Gets pointer to a prebuilt update vector used in updates. If the update
+graph has not yet been built in the prebuilt struct, then this function
+first builds it.
+@return prebuilt update vector */
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+	row_prebuilt_t*	prebuilt);	/*!< in: prebuilt struct in MySQL
+					handle */
+/** Does an update or delete of a row for MySQL.
+@param[in,out]	prebuilt	prebuilt struct in MySQL handle
+@return error code or DB_SUCCESS */
+dberr_t
+row_update_for_mysql(
+	row_prebuilt_t*		prebuilt)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** This can only be used when the current transaction is at
+READ COMMITTED or READ UNCOMMITTED isolation level.
+Before calling this function row_search_mvcc() must have
+initialized prebuilt->new_rec_locks to store the information which new
+record locks really were set. This function removes a newly set
+clustered index record lock under prebuilt->pcur or
+prebuilt->clust_pcur.  Thus, this implements a 'mini-rollback' that
+releases the latest clustered index record lock we set.
+@param[in,out]	prebuilt		prebuilt struct in MySQL handle
+@param[in]	has_latches_on_recs	TRUE if called so that we have the
+					latches on the records under pcur
+					and clust_pcur, and we do not need
+					to reposition the cursors. */
+void
+row_unlock_for_mysql(
+	row_prebuilt_t*	prebuilt,
+	ibool		has_latches_on_recs);
+
+/*********************************************************************//**
+Creates a query graph node of 'update' type to be used in the MySQL
+interface.
+@return own: update node */
+upd_node_t*
+row_create_update_node_for_mysql(
+/*=============================*/
+	dict_table_t*	table,	/*!< in: table to update */
+	mem_heap_t*	heap);	/*!< in: mem heap from which allocated */
+
+/**********************************************************************//**
+Does a cascaded delete or set null in a foreign key operation.
+@return error code or DB_SUCCESS */
+dberr_t
+row_update_cascade_for_mysql(
+/*=========================*/
+        que_thr_t*      thr,    /*!< in: query thread */
+        upd_node_t*     node,   /*!< in: update node used in the cascade
+                                or set null operation */
+        dict_table_t*   table)  /*!< in: table where we do the operation */
+        MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Lock the data dictionary cache exclusively (trx is evaluated twice). */
+#define row_mysql_lock_data_dictionary(trx)			\
+	do {							\
+		ut_ad(!(trx)->dict_operation_lock_mode);	\
+		dict_sys.lock(SRW_LOCK_CALL);			\
+		(trx)->dict_operation_lock_mode = true;		\
+	} while (0)
+
+/** Unlock the data dictionary (trx is evaluated more than once). */
+#define row_mysql_unlock_data_dictionary(trx)			\
+	do {							\
+		ut_ad(!lock_trx_has_sys_table_locks(trx));	\
+		ut_ad((trx)->dict_operation_lock_mode);		\
+		(trx)->dict_operation_lock_mode = false;	\
+		dict_sys.unlock();				\
+	} while (0)
+
+/*********************************************************************//**
+Creates a table for MySQL. On failure the transaction will be rolled back
+and the 'table' object will be freed.
+@return error code or DB_SUCCESS */
+dberr_t
+row_create_table_for_mysql(
+/*=======================*/
+	dict_table_t*	table,	/*!< in, own: table definition
+				(will be freed, or on DB_SUCCESS
+				added to the data dictionary cache) */
+	trx_t*		trx)	/*!< in/out: transaction */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Create an index when creating a table.
+On failure, the caller must drop the table!
+@return error number or DB_SUCCESS */
+dberr_t
+row_create_index_for_mysql(
+/*=======================*/
+	dict_index_t*	index,		/*!< in, own: index definition
+					(will be freed) */
+	trx_t*		trx,		/*!< in: transaction handle */
+	const ulint*	field_lengths,	/*!< in: if not NULL, must contain
+					dict_index_get_n_fields(index)
+					actual field lengths for the
+					index columns, which are
+					then checked for not being too
+					large. */
+	fil_encryption_t mode,	/*!< in: encryption mode */
+	uint32_t	key_id)	/*!< in: encryption key_id */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Discards the tablespace of a table which is stored in an .ibd file. Discarding
+means that this function deletes the .ibd file and assigns a new table id for
+the table. Also the file_unreadable flag is set.
+@return error code or DB_SUCCESS */
+dberr_t row_discard_tablespace_for_mysql(dict_table_t *table, trx_t *trx)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary.
+@return error code or DB_SUCCESS */
+dberr_t
+row_import_tablespace_for_mysql(
+/*============================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct in MySQL */
+        MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Renames a table for MySQL.
+@return error code or DB_SUCCESS */
+dberr_t
+row_rename_table_for_mysql(
+/*=======================*/
+	const char*	old_name,	/*!< in: old table name */
+	const char*	new_name,	/*!< in: new table name */
+	trx_t*		trx,		/*!< in/out: transaction */
+	bool		use_fk)		/*!< in: whether to parse and enforce
+					FOREIGN KEY constraints */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** A struct describing a place for an individual column in the MySQL
+row format which is presented to the table handler in ha_innobase.
+This template struct is used to speed up row transformations between
+Innobase and MySQL. */
+
+struct mysql_row_templ_t {
+	ulint	col_no;			/*!< column number of the column */
+	ulint	rec_field_no;		/*!< field number of the column in an
+					Innobase record in the current index;
+					not defined if template_type is
+					ROW_MYSQL_WHOLE_ROW */
+	ibool	rec_field_is_prefix;	/*!< is the field in a prefix index? */
+	ulint	rec_prefix_field_no;	/*!< record field, even if just a
+					prefix; same as rec_field_no when not a
+					prefix, otherwise rec_field_no is
+					ULINT_UNDEFINED but this is the true
+					field number */
+	ulint	clust_rec_field_no;	/*!< field number of the column in an
+					Innobase record in the clustered index;
+					not defined if template_type is
+					ROW_MYSQL_WHOLE_ROW */
+	ulint	icp_rec_field_no;	/*!< field number of the column in an
+					Innobase record in the current index;
+					not defined unless
+					index condition pushdown is used */
+	ulint	mysql_col_offset;	/*!< offset of the column in the MySQL
+					row format */
+	ulint	mysql_col_len;		/*!< length of the column in the MySQL
+					row format */
+	ulint	mysql_null_byte_offset;	/*!< MySQL NULL bit byte offset in a
+					MySQL record */
+	ulint	mysql_null_bit_mask;	/*!< bit mask to get the NULL bit,
+					zero if column cannot be NULL */
+	ulint	type;			/*!< column type in Innobase mtype
+					numbers DATA_CHAR... */
+	ulint	mysql_type;		/*!< MySQL type code; this is always
+					< 256 */
+	ulint	mysql_length_bytes;	/*!< if mysql_type
+					== DATA_MYSQL_TRUE_VARCHAR, this tells
+					whether we should use 1 or 2 bytes to
+					store the MySQL true VARCHAR data
+					length at the start of row in the MySQL
+					format (NOTE that the MySQL key value
+					format always uses 2 bytes for the data
+					len) */
+	ulint	charset;		/*!< MySQL charset-collation code
+					of the column, or zero */
+	ulint	mbminlen;		/*!< minimum length of a char, in bytes,
+					or zero if not a char type */
+	ulint	mbmaxlen;		/*!< maximum length of a char, in bytes,
+					or zero if not a char type */
+	ulint	is_unsigned;		/*!< if a column type is an integer
+					type and this field is != 0, then
+					it is an unsigned integer type */
+	ulint	is_virtual;		/*!< nonzero for a virtual column */
+};
+
+#define MYSQL_FETCH_CACHE_SIZE		8	/* slots in row_prebuilt_t::fetch_cache */
+/* After fetching this many rows, we start caching them in fetch_cache */
+#define MYSQL_FETCH_CACHE_THRESHOLD	4
+
+#define ROW_PREBUILT_ALLOCATED	78540783	/* magic_n of a created row_prebuilt_t */
+#define ROW_PREBUILT_FREED	26423527	/* magic_n of a freed row_prebuilt_t */
+
+/** A struct for (sometimes lazily) prebuilt structures in an Innobase table
+handle used within MySQL; these are used to save CPU time. */
+
+struct row_prebuilt_t {
+	ulint		magic_n;	/*!< this magic number is set to
+					ROW_PREBUILT_ALLOCATED when created,
+					or ROW_PREBUILT_FREED when the
+					struct has been freed */
+	dict_table_t*	table;		/*!< Innobase table handle */
+	dict_index_t*	index;		/*!< current index for a search, if
+					any */
+	trx_t*		trx;		/*!< current transaction handle */
+	unsigned	sql_stat_start:1;/*!< TRUE when we start processing of
+					an SQL statement: we may have to set
+					an intention lock on the table,
+					create a consistent read view etc. */
+	unsigned	clust_index_was_generated:1;
+					/*!< if the user did not define a
+					primary key in MySQL, then Innobase
+					automatically generated a clustered
+					index where the ordering column is
+					the row id: in this case this flag
+					is set to TRUE */
+	unsigned	index_usable:1;	/*!< caches the value of
+					row_merge_is_index_usable(trx,index) */
+	unsigned	read_just_key:1;/*!< set to 1 when MySQL calls
+					ha_innobase::extra with the
+					argument HA_EXTRA_KEYREAD; it is enough
+					to read just columns defined in
+					the index (i.e., no read of the
+					clustered index record necessary) */
+	unsigned	used_in_HANDLER:1;/*!< TRUE if we have been using this
+					handle in a MySQL HANDLER low level
+					index cursor command: then we must
+					store the pcur position even in a
+					unique search from a clustered index,
+					because HANDLER allows NEXT and PREV
+					in such a situation */
+	unsigned	template_type:2;/*!< ROW_MYSQL_WHOLE_ROW,
+					ROW_MYSQL_REC_FIELDS,
+					ROW_MYSQL_DUMMY_TEMPLATE, or
+					ROW_MYSQL_NO_TEMPLATE */
+	unsigned	n_template:10;	/*!< number of elements in the
+					template */
+	unsigned	null_bitmap_len:10;/*!< number of bytes in the SQL NULL
+					bitmap at the start of a row in the
+					MySQL format */
+	unsigned	need_to_access_clustered:1; /*!< if we are fetching
+					columns through a secondary index
+					and at least one column is not in
+					the secondary index, then this is
+					set to TRUE; note that sometimes this
+					is set but we later optimize out the
+					clustered index lookup */
+	unsigned	templ_contains_blob:1;/*!< TRUE if the template contains
+					a column with DATA_LARGE_MTYPE(
+					get_innobase_type_from_mysql_type())
+					is TRUE;
+					not to be confused with InnoDB
+					externally stored columns
+					(VARCHAR can be off-page too) */
+	unsigned	versioned_write:1;/*!< whether this is
+					a versioned write */
+	mysql_row_templ_t* mysql_template;/*!< template used to transform
+					rows fast between MySQL and Innobase
+					formats; memory for this template
+					is not allocated from 'heap' */
+	mem_heap_t*	heap;		/*!< memory heap from which
+					these auxiliary structures are
+					allocated when needed */
+	ins_node_t*	ins_node;	/*!< Innobase SQL insert node
+					used to perform inserts
+					to the table */
+	byte*		ins_upd_rec_buff;/*!< buffer for storing data converted
+					to the Innobase format from the MySQL
+					format */
+	const byte*	default_rec;	/*!< the default values of all columns
+					(a "default row") in MySQL format */
+	ulint		hint_need_to_fetch_extra_cols;
+					/*!< normally this is set to 0; if this
+					is set to ROW_RETRIEVE_PRIMARY_KEY,
+					then we should at least retrieve all
+					columns in the primary key; if this
+					is set to ROW_RETRIEVE_ALL_COLS, then
+					we must retrieve all columns in the
+					key (if read_just_key == 1), or all
+					columns in the table */
+	upd_node_t*	upd_node;	/*!< Innobase SQL update node used
+					to perform updates and deletes */
+	trx_id_t	trx_id;		/*!< The table->def_trx_id when
+					ins_graph was built */
+	que_fork_t*	ins_graph;	/*!< Innobase SQL query graph used
+					in inserts. Will be rebuilt on
+					trx_id or n_indexes mismatch. */
+	que_fork_t*	upd_graph;	/*!< Innobase SQL query graph used
+					in updates or deletes */
+	btr_pcur_t*	pcur;		/*!< persistent cursor used in selects
+					and updates */
+	btr_pcur_t*	clust_pcur;	/*!< persistent cursor used in
+					some selects and updates */
+	que_fork_t*	sel_graph;	/*!< dummy query graph used in
+					selects */
+	dtuple_t*	search_tuple;	/*!< prebuilt dtuple used in selects */
+	byte		row_id[DATA_ROW_ID_LEN];
+					/*!< if the clustered index was
+					generated, the row id of the
+					last row fetched is stored
+					here */
+	doc_id_t	fts_doc_id;	/* if the table has an FTS index on
+					it then we fetch the doc_id.
+					FTS-FIXME: Currently we fetch it always
+					but in the future we must only fetch
+					it when FTS columns are being
+					updated */
+	dtuple_t*	clust_ref;	/*!< prebuilt dtuple used in
+					sel/upd/del */
+	lock_mode	select_lock_type;/*!< LOCK_NONE, LOCK_S, or LOCK_X */
+	bool		skip_locked;	/*!< TL_{READ,WRITE}_SKIP_LOCKED */
+	lock_mode	stored_select_lock_type;/*!< this field is used to
+					remember the original select_lock_type
+					that was decided in ha_innodb.cc,
+					::store_lock(), ::external_lock(),
+					etc. */
+	ulint		row_read_type;	/*!< ROW_READ_WITH_LOCKS if row locks
+					should be obtained for records
+					under an UPDATE or DELETE cursor.
+					At READ UNCOMMITTED or
+					READ COMMITTED isolation level,
+					this can be set to
+					ROW_READ_TRY_SEMI_CONSISTENT, so that
+					if the row under an UPDATE or DELETE
+					cursor was locked by another
+					transaction, InnoDB will resort
+					to reading the last committed value
+					('semi-consistent read').  Then,
+					this field will be set to
+					ROW_READ_DID_SEMI_CONSISTENT to
+					indicate that.	If the row does not
+					match the WHERE condition, MySQL will
+					invoke handler::unlock_row() to
+					clear the flag back to
+					ROW_READ_TRY_SEMI_CONSISTENT and
+					to simply skip the row.	 If
+					the row matches, the next call to
+					row_search_mvcc() will lock
+					the row.
+					This eliminates lock waits in some
+					cases; note that this breaks
+					serializability. */
+	ulint		new_rec_locks;	/*!< normally 0; if
+					the session is using READ
+					COMMITTED or READ UNCOMMITTED
+					isolation level, set in
+					row_search_mvcc() if we set a new
+					record lock on the secondary
+					or clustered index; this is
+					used in row_unlock_for_mysql()
+					when releasing the lock under
+					the cursor if we determine
+					after retrieving the row that
+					it does not need to be locked
+					('mini-rollback') */
+	ulint		mysql_prefix_len;/*!< byte offset of the end of
+					the last requested column */
+	ulint		mysql_row_len;	/*!< length in bytes of a row in the
+					MySQL format */
+	ulint		n_rows_fetched;	/*!< number of rows fetched after
+					positioning the current cursor */
+	ulint		fetch_direction;/*!< ROW_SEL_NEXT or ROW_SEL_PREV */
+	byte*		fetch_cache[MYSQL_FETCH_CACHE_SIZE];
+					/*!< a cache for fetched rows if we
+					fetch many rows from the same cursor:
+					it saves CPU time to fetch them in a
+					batch; we reserve mysql_row_len
+					bytes for each such row; these
+					pointers point 4 bytes past the
+					allocated mem buf start, because
+					there is a 4 byte magic number at the
+					start and at the end */
+	bool		keep_other_fields_on_keyread; /*!< when using fetch
+					cache with HA_EXTRA_KEYREAD, don't
+					overwrite other fields in the MySQL
+					row buffer. */
+	ulint		fetch_cache_first;/*!< position of the first not yet
+					fetched row in fetch_cache */
+	ulint		n_fetch_cached;	/*!< number of not yet fetched rows
+					in fetch_cache */
+	mem_heap_t*	blob_heap;	/*!< in SELECTS BLOB fields are copied
+					to this heap */
+	mem_heap_t*	old_vers_heap;	/*!< memory heap where a previous
+					version is built in consistent read */
+	bool		in_fts_query;	/*!< Whether we are in a FTS query */
+	bool		fts_doc_id_in_read_set; /*!< true if table has externally
+					defined FTS_DOC_ID column. */
+	/*----------------------*/
+	ulonglong	autoinc_last_value;
+					/*!< last value of AUTO-INC interval */
+	ulonglong	autoinc_increment;/*!< The increment step of the auto
+					increment column. Value must be
+					greater than or equal to 1. Required to
+					calculate the next value */
+	ulonglong	autoinc_offset; /*!< The offset passed to
+					get_auto_increment() by MySQL. Required
+					to calculate the next value */
+	dberr_t		autoinc_error;	/*!< The actual error code encountered
+					while trying to init or read the
+					autoinc value from the table. We
+					store it here so that we can return
+					it to MySQL */
+	/*----------------------*/
+
+	/** Argument of handler_rowid_filter_check(),
+	or NULL if no PRIMARY KEY filter is pushed */
+	ha_innobase*	pk_filter;
+
+	/** Argument to handler_index_cond_check(),
+	or NULL if no index condition pushdown (ICP) is used. */
+	ha_innobase*	idx_cond;
+	ulint		idx_cond_n_cols;/*!< Number of fields in idx_cond_cols.
+					0 if and only if idx_cond == NULL. */
+	/*----------------------*/
+
+	/*----------------------*/
+	rtr_info_t*	rtr_info;	/*!< R-tree Search Info */
+	/*----------------------*/
+
+	ulint		magic_n2;	/*!< this should be the same as
+					magic_n */
+
+	byte*		srch_key_val1;  /*!< buffer used in converting
+					search key values from MySQL format
+					to InnoDB format.*/
+	byte*		srch_key_val2;  /*!< buffer used in converting
+					search key values from MySQL format
+					to InnoDB format.*/
+	uint		srch_key_val_len; /*!< Size of search key */
+	/** The MySQL table object */
+	TABLE*		m_mysql_table;
+
+	/** Look up the non-virtual template for a dict_table_t::cols[] number */
+	const mysql_row_templ_t* get_template_by_col(ulint col) const
+	{
+		ut_ad(col < n_template);
+		ut_ad(mysql_template);
+		for (ulint pos = col; pos < n_template; pos++) {
+			const mysql_row_templ_t& entry = mysql_template[pos];
+			if (entry.col_no == col && !entry.is_virtual) {
+				return &entry;
+			}
+		}
+		return nullptr;
+	}
+};
+
+/** Callback for row_mysql_sys_index_iterate() */
+struct SysIndexCallback {
+	virtual ~SysIndexCallback() = default;
+
+	/** Callback method; declared throw(), so overrides must not throw.
+	@param mtr current mini transaction
+	@param pcur persistent cursor. */
+	virtual void operator()(mtr_t* mtr, btr_pcur_t* pcur) throw() = 0;
+};
+
+
+class String;
+
+/** Storage for calculating virtual columns */
+struct VCOL_STORAGE
+{
+	TABLE *maria_table= nullptr;
+	byte *innobase_record= nullptr;
+	byte *maria_record= nullptr;
+	String *blob_value_storage= nullptr;
+	/** Start with nothing allocated: every pointer is null. */
+	VCOL_STORAGE() {}
+};
+
+/**
+   Allocate a heap and record for calculating virtual fields
+   Used mainly for virtual fields in indexes
+
+@param[in]	thd		MariaDB THD
+@param[in]	index		Index in use
+@param[out]	heap		Heap that holds temporary row
+@param[in,out]	table		MariaDB table
+@param[out]	storage		Internal storage for blobs etc
+
+@return whether the allocation succeeded; NOTE(review): inferred from
+ib_vcol_row::record(), which treats a false return as failure - confirm
+against the definition
+*/
+
+bool innobase_allocate_row_for_vcol(THD *thd,
+				    const dict_index_t* index,
+				    mem_heap_t**  heap,
+				    TABLE**	  table,
+				    VCOL_STORAGE* storage);
+
+/** Free memory allocated by innobase_allocate_row_for_vcol() */
+void innobase_free_row_for_vcol(VCOL_STORAGE *storage);
+
+/** Lazily allocated row buffer for virtual columns; freed with heap */
+class ib_vcol_row
+{
+  VCOL_STORAGE storage;
+public:
+  mem_heap_t *heap;
+  ib_vcol_row(mem_heap_t *heap) : heap(heap) {}
+
+  /** @return the record buffer, or nullptr if it could not be allocated */
+  byte *record(THD *thd, const dict_index_t *index, TABLE **table)
+  {
+    if (storage.innobase_record)
+      return storage.innobase_record;
+    return innobase_allocate_row_for_vcol(thd, index, &heap, table, &storage)
+      ? storage.innobase_record : nullptr;
+  }
+
+  ~ib_vcol_row()
+  {
+    if (!heap)
+      return;
+    if (storage.innobase_record)
+      innobase_free_row_for_vcol(&storage);
+    mem_heap_free(heap);
+  }
+};
+
+/** Report virtual value computation failure in ib::error
+@param[in]    row    the data row
+*/
+ATTRIBUTE_COLD
+void innobase_report_computed_value_failed(dtuple_t *row);
+
+/** Get the computed value by supplying the base column values.
+@param[in,out]	row		the data row
+@param[in]	col		virtual column
+@param[in]	index		index on the virtual column
+@param[in,out]	local_heap	heap memory for processing large data etc.
+@param[in,out]	heap		memory heap that copies the actual index row
+@param[in]	ifield		index field
+@param[in]	thd		connection handle
+@param[in,out]	mysql_table	MariaDB table handle
+@param[in,out]	mysql_rec	MariaDB record buffer
+@param[in]	old_table	during ALTER TABLE, this is the old table
+				or NULL.
+@param[in]	update	update vector for the parent row
+@param[in]	ignore_warnings	ignore warnings during calculation. Usually
+				means that a calculation is internal and
+				should have no side effects.
+@return the field filled with computed value */
+dfield_t*
+innobase_get_computed_value(
+	dtuple_t*		row,
+	const dict_v_col_t*	col,
+	const dict_index_t*	index,
+	mem_heap_t**		local_heap,
+	mem_heap_t*		heap,
+	const dict_field_t*	ifield,
+	THD*			thd,
+	TABLE*			mysql_table,
+	byte*			mysql_rec,
+	const dict_table_t*	old_table=NULL,
+	const upd_t*		update=NULL,
+	bool			ignore_warnings=false);
+
+/** Change dbname and table name in table->vc_templ.
+@param[in,out]	table	the table whose virtual column template
+dbname and tbname to be renamed. */
+void
+innobase_rename_vc_templ(
+	dict_table_t*	table);
+
+#define ROW_PREBUILT_FETCH_MAGIC_N	465765687
+
+#define ROW_MYSQL_WHOLE_ROW	0
+#define ROW_MYSQL_REC_FIELDS	1
+#define ROW_MYSQL_NO_TEMPLATE	2
+#define ROW_MYSQL_DUMMY_TEMPLATE 3	/* dummy template used in
+					row_check_index() */
+
+/* Values for hint_need_to_fetch_extra_cols */
+#define ROW_RETRIEVE_PRIMARY_KEY	1
+#define ROW_RETRIEVE_ALL_COLS		2
+
+/* Values for row_read_type */
+#define ROW_READ_WITH_LOCKS		0
+#define ROW_READ_TRY_SEMI_CONSISTENT	1
+#define ROW_READ_DID_SEMI_CONSISTENT	2
+
+#endif /* row0mysql.h */
diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h
new file mode 100644
index 00000000..1daf4d4a
--- /dev/null
+++ b/storage/innobase/include/row0purge.h
@@ -0,0 +1,149 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0purge.h
+Purge obsolete records
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "que0types.h"
+#include "btr0types.h"
+#include "btr0pcur.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include "row0mysql.h"
+#include "mysqld.h"
+#include <queue>
+#include <unordered_map>
+
+class MDL_ticket;
+/** Determines if it is possible to remove a secondary index entry.
+Removal is possible if the secondary index entry does not refer to any
+not delete marked version of a clustered index record where DB_TRX_ID
+is newer than the purge view.
+
+NOTE: This function should only be called by the purge thread, only
+while holding a latch on the leaf page of the secondary index entry
+(or keeping the buffer pool watch on the page).  It is possible that
+this function first returns true and then false, if a user transaction
+inserts a record that the secondary index entry would refer to.
+However, in that case, the user transaction would also re-insert the
+secondary index entry after purge has removed it and released the leaf
+page latch.
+@param[in,out]	node		row purge node
+@param[in]	index		secondary index
+@param[in]	entry		secondary index entry
+@param[in,out]	sec_pcur	secondary index cursor or NULL
+				if it is called for purge buffering
+				operation.
+@param[in,out]	sec_mtr		mini-transaction which holds
+				secondary index entry or NULL if it is
+				called for purge buffering operation.
+@param[in]	is_tree		true=pessimistic purge,
+				false=optimistic (leaf-page only)
+@return true if the secondary index record can be purged */
+bool
+row_purge_poss_sec(
+	purge_node_t*	node,
+	dict_index_t*	index,
+	const dtuple_t*	entry,
+	btr_pcur_t*	sec_pcur=NULL,
+	mtr_t*		sec_mtr=NULL,
+	bool		is_tree=false);
+
+/***************************************************************
+Does the purge operation.
+@return query thread to run next */
+que_thr_t*
+row_purge_step(
+/*===========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Purge worker context: the state of one QUE_NODE_PURGE query graph node */
+struct purge_node_t
+{
+  /** node type: QUE_NODE_PURGE */
+  que_common_t common;
+
+  /** DB_TRX_ID of the undo log record */
+  trx_id_t trx_id;
+  /** DB_ROLL_PTR pointing to the undo log record */
+  roll_ptr_t roll_ptr;
+
+  /** undo number of the record */
+  undo_no_t undo_no;
+
+  /** record type: TRX_UNDO_INSERT_REC, ... */
+  byte rec_type;
+  /** compiler analysis info of an update */
+  byte cmpl_info;
+  /** whether the clustered index record determined by ref was found
+  in the clustered index of the table, and we were able to position
+  pcur on it */
+  bool found_clust;
+#ifdef UNIV_DEBUG
+  /** whether the operation is in progress */
+  bool in_progress= false;
+#endif
+  /** table where purge is done */
+  dict_table_t *table= nullptr;
+  /** update vector for a clustered index record */
+  upd_t *update;
+  /** row reference to the next row to handle, or nullptr */
+  const dtuple_t *ref;
+  /** nullptr, or a deep copy of the indexed fields of the row to handle */
+  dtuple_t *row;
+  /** nullptr, or the next index of table whose record should be handled */
+  dict_index_t *index;
+  /** memory heap used as auxiliary storage; must be emptied between rows */
+  mem_heap_t *heap;
+  /** persistent cursor to the clustered index record */
+  btr_pcur_t pcur;
+
+  /** queue of undo log records to purge */
+  std::queue<trx_purge_rec_t> undo_recs;
+
+  /** map of table identifiers to table handles and meta-data locks */
+  std::unordered_map<table_id_t, std::pair<dict_table_t*,MDL_ticket*>> tables;
+
+  /** Constructor; explicitly initializes common, heap and tables only */
+  explicit purge_node_t(que_thr_t *parent) :
+    common(QUE_NODE_PURGE, parent), heap(mem_heap_create(256)),
+    tables(TRX_PURGE_TABLE_BUCKETS) {}
+
+#ifdef UNIV_DEBUG
+  /** Validate the persistent cursor. The purge node has two references
+  to the clustered index record: ref and pcur, which must match
+  each other if found_clust.
+  @return whether pcur is consistent with ref */
+  bool validate_pcur();
+#endif
+
+  /** Start processing an undo log record. */
+  inline void start();
+
+  /** Reset the state at end
+  @return the query graph parent */
+  inline que_node_t *end(THD *);
+};
diff --git a/storage/innobase/include/row0quiesce.h b/storage/innobase/include/row0quiesce.h
new file mode 100644
index 00000000..b05b7666
--- /dev/null
+++ b/storage/innobase/include/row0quiesce.h
@@ -0,0 +1,67 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0quiesce.h
+
+Header file for tablespace quiesce functions.
+
+Created 2012-02-08 by Sunny Bains
+*******************************************************/
+
+#ifndef row0quiesce_h
+#define row0quiesce_h
+
+#include "dict0types.h"
+
+struct trx_t;
+
+/** The version number of the export meta-data text file. */
+#define IB_EXPORT_CFG_VERSION_V1	0x1UL
+
+/*********************************************************************//**
+Quiesce the tablespace that the table resides in. */
+void
+row_quiesce_table_start(
+/*====================*/
+	dict_table_t*	table,		/*!< in: quiesce this table */
+	trx_t*		trx)		/*!< in/out: transaction/session */
+        MY_ATTRIBUTE((nonnull));
+
+/*********************************************************************//**
+Set a table's quiesce state.
+@return DB_SUCCESS or error code. */
+dberr_t
+row_quiesce_set_state(
+/*==================*/
+	dict_table_t*	table,		/*!< in: quiesce this table */
+	ib_quiesce_t	state,		/*!< in: quiesce state to set */
+	trx_t*		trx)		/*!< in/out: transaction */
+        MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Cleanup after table quiesce. */
+void
+row_quiesce_table_complete(
+/*=======================*/
+	dict_table_t*	table,		/*!< in: quiesce this table */
+	trx_t*		trx)		/*!< in/out: transaction/session */
+        MY_ATTRIBUTE((nonnull));
+
+#endif /* row0quiesce_h */
diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h
new file mode 100644
index 00000000..a1350740
--- /dev/null
+++ b/storage/innobase/include/row0row.h
@@ -0,0 +1,431 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0row.h
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0row_h
+#define row0row_h
+
+#include "que0types.h"
+#include "ibuf0ibuf.h"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "row0types.h"
+#include "btr0types.h"
+
+/*********************************************************************//**
+Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of
+a clustered index record.
+@return offset of DATA_TRX_ID */
+UNIV_INLINE
+ulint
+row_get_trx_id_offset(
+/*==================*/
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const rec_offs*		offsets)/*!< in: record offsets */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Reads the trx id field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+trx_id_t
+row_get_rec_trx_id(
+/*===============*/
+	const rec_t*		rec,	/*!< in: record */
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const rec_offs*		offsets)/*!< in: rec_get_offsets(rec, index) */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Reads the roll pointer field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+roll_ptr_t
+row_get_rec_roll_ptr(
+/*=================*/
+	const rec_t*		rec,	/*!< in: record */
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const rec_offs*		offsets)/*!< in: rec_get_offsets(rec, index) */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/* Flags for row build type. */
+#define ROW_BUILD_NORMAL	0	/*!< build index row */
+#define ROW_BUILD_FOR_PURGE	1	/*!< build row for purge. */
+#define ROW_BUILD_FOR_UNDO	2	/*!< build row for undo. */
+#define ROW_BUILD_FOR_INSERT	3	/*!< build row for insert. */
+
+/*****************************************************************//**
+When an insert or purge to a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged
+@retval NULL if the externally stored columns in the clustered index record
+are unavailable and ext != NULL, or row is missing some needed columns. */
+dtuple_t*
+row_build_index_entry_low(
+/*======================*/
+	const dtuple_t*		row,	/*!< in: row which should be
+					inserted or purged */
+	const row_ext_t*	ext,	/*!< in: externally stored column
+					prefixes, or NULL */
+	const dict_index_t*	index,	/*!< in: index on the table */
+	mem_heap_t*		heap,	/*!< in,out: memory heap from which
+					the memory for the index entry
+					is allocated */
+	ulint			flag)	/*!< in: ROW_BUILD_NORMAL,
+					ROW_BUILD_FOR_PURGE, ROW_BUILD_FOR_UNDO
+					or ROW_BUILD_FOR_INSERT */
+	MY_ATTRIBUTE((warn_unused_result, nonnull(1,3,4)));
+/*****************************************************************//**
+When an insert or purge to a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged, or NULL if the
+externally stored columns in the clustered index record are
+unavailable and ext != NULL */
+UNIV_INLINE
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+	const dtuple_t*		row,	/*!< in: row which should be
+					inserted or purged */
+	const row_ext_t*	ext,	/*!< in: externally stored column
+					prefixes, or NULL */
+	const dict_index_t*	index,	/*!< in: index on the table */
+	mem_heap_t*		heap)	/*!< in,out: memory heap from which
+					the memory for the index entry
+					is allocated */
+	MY_ATTRIBUTE((warn_unused_result, nonnull(1,3,4)));
+/*******************************************************************//**
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index.
+@return own: row built; see the NOTE below! */
+dtuple_t*
+row_build(
+/*======*/
+	ulint			type,	/*!< in: ROW_COPY_POINTERS or
+					ROW_COPY_DATA; the latter
+					copies also the data fields to
+					heap while the first only
+					places pointers to data fields
+					on the index page, and thus is
+					more efficient */
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const rec_t*		rec,	/*!< in: record in the clustered
+					index; NOTE: in the case
+					ROW_COPY_POINTERS the data
+					fields in the row will point
+					directly into this record,
+					therefore, the buffer page of
+					this record must be at least
+					s-latched and the latch held
+					as long as the row dtuple is used! */
+	const rec_offs*		offsets,/*!< in: rec_get_offsets(rec,index)
+					or NULL, in which case this function
+					will invoke rec_get_offsets() */
+	const dict_table_t*	col_table,
+					/*!< in: table, to check which
+					externally stored columns
+					occur in the ordering columns
+					of an index, or NULL if
+					index->table should be
+					consulted instead; the user
+					columns in this table should be
+					the same columns as in index->table */
+	const dtuple_t*		defaults,
+					/*!< in: default values of
+					added, changed columns, or NULL */
+	const ulint*		col_map,/*!< in: mapping of old column
+					numbers to new ones, or NULL */
+	row_ext_t**		ext,	/*!< out, own: cache of
+					externally stored column
+					prefixes, or NULL */
+	mem_heap_t*		heap);	/*!< in: memory heap from which
+					the memory needed is allocated */
+
+/** An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index, with possible indexing on ongoing
+addition of new virtual columns.
+@param[in]	type		ROW_COPY_POINTERS or ROW_COPY_DATA;
+@param[in]	index		clustered index
+@param[in]	rec		record in the clustered index
+@param[in]	offsets		rec_get_offsets(rec,index) or NULL
+@param[in]	col_table	table, to check which
+				externally stored columns
+				occur in the ordering columns
+				of an index, or NULL if
+				index->table should be
+				consulted instead
+@param[in]	defaults	default values of added, changed columns, or NULL
+@param[in]	add_v		new virtual columns added
+				along with new indexes
+@param[in]	col_map		mapping of old column
+				numbers to new ones, or NULL
+@param[out]	ext		cache of externally stored column
+				prefixes, or NULL
+@param[in]	heap		memory heap from which
+				the memory needed is allocated
+@return own: row built */
+dtuple_t*
+row_build_w_add_vcol(
+	ulint			type,
+	const dict_index_t*	index,
+	const rec_t*		rec,
+	const rec_offs*		offsets,
+	const dict_table_t*	col_table,
+	const dtuple_t*		defaults,
+	const dict_add_v_col_t*	add_v,
+	const ulint*		col_map,
+	row_ext_t**		ext,
+	mem_heap_t*		heap);
+
+/*******************************************************************//**
+Converts an index record to a typed data tuple.
+@return index entry built; does not set info_bits, and the data fields
+in the entry will point directly to rec */
+dtuple_t*
+row_rec_to_index_entry_low(
+/*=======================*/
+	const rec_t*		rec,	/*!< in: record in the index */
+	const dict_index_t*	index,	/*!< in: index */
+	const rec_offs*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	mem_heap_t*		heap)	/*!< in: memory heap from which
+					the memory needed is allocated */
+	MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************************//**
+Converts an index record to a typed data tuple. NOTE that externally
+stored (often big) fields are NOT copied to heap.
+@return own: index entry built */
+dtuple_t*
+row_rec_to_index_entry(
+/*===================*/
+	const rec_t*		rec,	/*!< in: record in the index */
+	const dict_index_t*	index,	/*!< in: index */
+	const rec_offs*		offsets,/*!< in/out: rec_get_offsets(rec) */
+	mem_heap_t*		heap)	/*!< in: memory heap from which
+					the memory needed is allocated */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Convert a metadata record to a data tuple.
+@param[in]	rec		metadata record
+@param[in]	index		clustered index after instant ALTER TABLE
+@param[in]	offsets		rec_get_offsets(rec)
+@param[in,out]	heap		memory heap for allocations
+@param[in]	info_bits	the info_bits after an update
+@param[in]	pad		whether to pad to index->n_fields */
+dtuple_t*
+row_metadata_to_tuple(
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	const rec_offs*		offsets,
+	mem_heap_t*		heap,
+	ulint			info_bits,
+	bool			pad)
+	MY_ATTRIBUTE((nonnull,warn_unused_result));
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record.
+@return own: row reference built; see the NOTE below! */
+dtuple_t*
+row_build_row_ref(
+/*==============*/
+	ulint		type,	/*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+				the former copies also the data fields to
+				heap, whereas the latter only places pointers
+				to data fields on the index page */
+	dict_index_t*	index,	/*!< in: secondary index */
+	const rec_t*	rec,	/*!< in: record in the index;
+				NOTE: in the case ROW_COPY_POINTERS
+				the data fields in the row will point
+				directly into this record, therefore,
+				the buffer page of this record must be
+				at least s-latched and the latch held
+				as long as the row reference is used! */
+	mem_heap_t*	heap)	/*!< in: memory heap from which the memory
+				needed is allocated */
+	MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+void
+row_build_row_ref_in_tuple(
+/*=======================*/
+	dtuple_t*		ref,	/*!< in/out: row reference built;
+					see the NOTE below! */
+	const rec_t*		rec,	/*!< in: record in the index;
+					NOTE: the data fields in ref
+					will point directly into this
+					record, therefore, the buffer
+					page of this record must be at
+					least s-latched and the latch
+					held as long as the row
+					reference is used! */
+	const dict_index_t*	index,	/*!< in: secondary index */
+	rec_offs*		offsets)/*!< in: rec_get_offsets(rec, index)
+					or NULL */
+	MY_ATTRIBUTE((nonnull(1,2,3)));
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INLINE
+void
+row_build_row_ref_fast(
+/*===================*/
+	dtuple_t*	ref,	/*!< in/out: typed data tuple where the
+				reference is built */
+	const ulint*	map,	/*!< in: array of field numbers in rec
+				telling how ref should be built from
+				the fields of rec */
+	const rec_t*	rec,	/*!< in: secondary index record;
+				must be preserved while ref is used, as we do
+				not copy field values to heap */
+	const rec_offs*	offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Searches the clustered index record for a row, if we have the row
+reference.
+@return true if found */
+bool
+row_search_on_row_ref(
+/*==================*/
+	btr_pcur_t*		pcur,	/*!< out: persistent cursor, which must
+					be closed by the caller */
+	btr_latch_mode		mode,	/*!< in: BTR_MODIFY_LEAF, ... */
+	const dict_table_t*	table,	/*!< in: table */
+	const dtuple_t*		ref,	/*!< in: row reference */
+	mtr_t*			mtr)	/*!< in/out: mtr */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Fetches the clustered index record for a secondary index record. The latches
+on the secondary index record are preserved.
+@return record or NULL, if no record found */
+rec_t*
+row_get_clust_rec(
+/*==============*/
+	btr_latch_mode	mode,	/*!< in: BTR_MODIFY_LEAF, ... */
+	const rec_t*	rec,	/*!< in: record in a secondary index */
+	dict_index_t*	index,	/*!< in: secondary index */
+	dict_index_t**	clust_index,/*!< out: clustered index */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Parse the integer data from specified data, which could be
+DATA_INT, DATA_FLOAT or DATA_DOUBLE. If the value is less than 0
+and the type is not unsigned then we reset the value to 0
+@param[in]	data		data to read
+@param[in]	len		length of data
+@param[in]	mtype		mtype of data
+@param[in]	unsigned_type	if the data is unsigned
+@return the integer value from the data */
+inline
+ib_uint64_t
+row_parse_int(
+	const byte*	data,
+	ulint		len,
+	ulint		mtype,
+	bool		unsigned_type);
+
+/** Result of row_search_index_entry() */
+enum row_search_result {
+	ROW_FOUND = 0,		/*!< the record was found */
+	ROW_NOT_FOUND,		/*!< the record was not found */
+	ROW_BUFFERED,		/*!< one of BTR_INSERT, BTR_DELETE, or
+				BTR_DELETE_MARK was specified, the
+				secondary index leaf page was not in
+				the buffer pool, and the operation was
+				enqueued in the insert/delete buffer */
+	ROW_NOT_DELETED_REF	/*!< BTR_DELETE was specified, and
+				row_purge_poss_sec() failed */
+};
+
+/***************************************************************//**
+Searches an index record.
+@return whether the record was found or buffered */
+enum row_search_result
+row_search_index_entry(
+/*===================*/
+	const dtuple_t*	entry,	/*!< in: index entry */
+	btr_latch_mode	mode,	/*!< in: BTR_MODIFY_LEAF, ... */
+	btr_pcur_t*	pcur,	/*!< in/out: persistent cursor, which must
+				be closed by the caller */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#define ROW_COPY_DATA		1
+#define ROW_COPY_POINTERS	2
+
+/* The allowed latching order of index records is the following:
+(1) a secondary index record ->
+(2) the clustered index record ->
+(3) rollback segment data for the clustered index record. */
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) using
+"dict_field" and writes the result to "buf".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size is positive) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+ulint
+row_raw_format(
+/*===========*/
+	const char*		data,		/*!< in: raw data */
+	ulint			data_len,	/*!< in: raw data length
+						in bytes */
+	const dict_field_t*	dict_field,	/*!< in: index field */
+	char*			buf,		/*!< out: output buffer */
+	ulint			buf_size)	/*!< in: output buffer size
+						in bytes */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Start a mini-transaction that is going to modify an index, and
+prepare it according to the tablespace that the index resides in.
+@param[in,out]	mtr		mini-transaction
+@param[in,out]	index		possibly secondary index
+@param[in]	pessimistic	whether this is a pessimistic operation */
+inline
+void
+row_mtr_start(mtr_t* mtr, dict_index_t* index, bool pessimistic)
+{
+	mtr->start();
+
+	const auto	space_id = index->table->space_id;
+
+	if (space_id == IBUF_SPACE_ID) {
+		if (pessimistic
+		    && (index->type & (DICT_UNIQUE | DICT_SPATIAL)) == 0) {
+			ibuf_free_excess_pages();
+		}
+	} else if (space_id == SRV_TMP_SPACE_ID) {
+		/* no redo log is written for this tablespace */
+		mtr->set_log_mode(MTR_LOG_NO_REDO);
+	} else {
+		index->set_modified(*mtr);
+	}
+
+	log_free_check();
+}
+
+#include "row0row.inl"
+
+#endif
diff --git a/storage/innobase/include/row0row.inl b/storage/innobase/include/row0row.inl
new file mode 100644
index 00000000..e89adb58
--- /dev/null
+++ b/storage/innobase/include/row0row.inl
@@ -0,0 +1,221 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2018, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0row.inl
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0dict.h"
+#include "rem0rec.h"
+#include "trx0undo.h"
+
+/*********************************************************************//**
+Determines where the DB_TRX_ID field starts, as a byte offset from the
+origin of a clustered index record.
+@return offset of DATA_TRX_ID */
+UNIV_INLINE
+ulint
+row_get_trx_id_offset(
+/*==================*/
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const rec_offs*		offsets)/*!< in: record offsets */
+{
+	ut_ad(rec_offs_validate(NULL, index, offsets));
+
+	ulint		field_len;
+	const ulint	trx_id_offs = rec_get_nth_field_offs(
+		offsets, index->db_trx_id(), &field_len);
+
+	/* the field is expected to be exactly DATA_TRX_ID_LEN bytes */
+	ut_ad(field_len == DATA_TRX_ID_LEN);
+
+	return(trx_id_offs);
+}
+
+/*********************************************************************//**
+Reads the DB_TRX_ID field of a clustered index record.
+@return value of the field */
+UNIV_INLINE
+trx_id_t
+row_get_rec_trx_id(
+/*===============*/
+	const rec_t*		rec,	/*!< in: record */
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const rec_offs*		offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	/* index->trx_id_offset is used when it is nonzero; otherwise
+	the offset is looked up in the record offsets */
+	const ulint	cached_offs = index->trx_id_offset;
+
+	const ulint	trx_id_offs = cached_offs
+		? cached_offs
+		: row_get_trx_id_offset(index, offsets);
+
+	return(trx_read_trx_id(rec + trx_id_offs));
+}
+
+/*********************************************************************//**
+Reads the roll pointer field of a clustered index record.
+@return value of the field */
+UNIV_INLINE
+roll_ptr_t
+row_get_rec_roll_ptr(
+/*=================*/
+	const rec_t*		rec,	/*!< in: record */
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const rec_offs*		offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	const ulint	cached_offs = index->trx_id_offset;
+	const ulint	trx_id_offs = cached_offs
+		? cached_offs
+		: row_get_trx_id_offset(index, offsets);
+
+	/* the roll pointer is read from the bytes that follow the
+	DATA_TRX_ID_LEN bytes at the trx id offset */
+	const rec_t*	roll_ptr_pos = rec + trx_id_offs + DATA_TRX_ID_LEN;
+	return(trx_read_roll_ptr(roll_ptr_pos));
+}
+
+/*****************************************************************//**
+Builds, with ROW_BUILD_NORMAL, the entry that is to be inserted into or
+purged from an index on the table when a row is inserted or purged.
+@return index entry which should be inserted or purged, or NULL if the
+externally stored columns in the clustered index record are
+unavailable and ext != NULL */
+UNIV_INLINE
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+	const dtuple_t*		row,	/*!< in: row which should be
+					inserted or purged */
+	const row_ext_t*	ext,	/*!< in: externally stored column
+					prefixes, or NULL */
+	const dict_index_t*	index,	/*!< in: index on the table */
+	mem_heap_t*		heap)	/*!< in,out: memory heap from which
+					the memory for the index entry
+					is allocated */
+{
+	ut_ad(dtuple_check_typed(row));
+
+	dtuple_t* const	built = row_build_index_entry_low(
+		row, ext, index, heap, ROW_BUILD_NORMAL);
+
+	ut_ad(built == NULL || dtuple_check_typed(built));
+	return(built);
+}
+
+/*******************************************************************//**
+Fills in a row reference, with which the clustered index record can be
+searched, from the fields of a secondary index record. */
+UNIV_INLINE
+void
+row_build_row_ref_fast(
+/*===================*/
+	dtuple_t*	ref,	/*!< in/out: typed data tuple where the
+				reference is built */
+	const ulint*	map,	/*!< in: array of field numbers in rec
+				telling how ref should be built from
+				the fields of rec */
+	const rec_t*	rec,	/*!< in: secondary index record;
+				must be preserved while ref is used, as we do
+				not copy field values to heap */
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	ut_ad(!rec_offs_any_extern(offsets));
+
+	const ulint	n_ref_fields = dtuple_get_n_fields(ref);
+
+	for (ulint pos = 0; pos < n_ref_fields; pos++) {
+		dfield_t* const	target = dtuple_get_nth_field(ref, pos);
+		const ulint	rec_field_no = map[pos];
+
+		if (rec_field_no == ULINT_UNDEFINED) {
+			/* nothing in rec is mapped to this position;
+			the field of ref is left as it is */
+			continue;
+		}
+
+		ulint		data_len;
+		const byte*	data = rec_get_nth_field(
+			rec, offsets, rec_field_no, &data_len);
+
+		/* only the pointer is stored in ref; the field
+		value itself is not copied */
+		dfield_set_data(target, data, data_len);
+	}
+}
+
+/** Parse the integer data from specified data, which could be
+DATA_INT, DATA_FLOAT or DATA_DOUBLE. A negative floating-point value, or
+a value that is less than 0 in a type that is not unsigned, is read as 0.
+@param[in]	data		data to read
+@param[in]	len		length of data
+@param[in]	mtype		mtype of data
+@param[in]	unsigned_type	if the data is unsigned
+@return the integer value from the data */
+ib_uint64_t
+row_parse_int(
+	const byte*	data,
+	ulint		len,
+	ulint		mtype,
+	bool		unsigned_type)
+{
+	ib_uint64_t	value = 0;
+
+	switch (mtype) {
+	case DATA_INT:
+		ut_a(len <= sizeof value);
+		value = mach_read_int_type(data, len, unsigned_type);
+		break;
+
+	case DATA_FLOAT: {
+		ut_a(len == sizeof(float));
+		const float	f = mach_float_read(data);
+		/* Converting a negative floating-point value to an
+		unsigned integer type is undefined behavior. */
+		value = f < 0 ? 0 : static_cast<ib_uint64_t>(f);
+		break;
+	}
+	case DATA_DOUBLE: {
+		ut_a(len == sizeof(double));
+		const double	d = mach_double_read(data);
+		value = d < 0 ? 0 : static_cast<ib_uint64_t>(d);
+		break;
+	}
+	default:
+		ut_error;
+	}
+
+	if (!unsigned_type && static_cast<int64_t>(value) < 0) {
+		value = 0;
+	}
+
+	return(value);
+}
+
diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h
new file mode 100644
index 00000000..8134c60f
--- /dev/null
+++ b/storage/innobase/include/row0sel.h
@@ -0,0 +1,457 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2017, Oracle and/or its affiliates.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0sel.h
+Select
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "data0data.h"
+#include "que0types.h"
+#include "trx0types.h"
+#include "read0types.h"
+#include "row0types.h"
+#include "que0types.h"
+#include "pars0sym.h"
+#include "btr0pcur.h"
+#include "row0mysql.h"
+
+/*********************************************************************//**
+Creates a select node struct.
+@return own: select node struct */
+sel_node_t*
+sel_node_create(
+/*============*/
+	mem_heap_t*	heap);	/*!< in: memory heap where created */
+/*********************************************************************//**
+Frees the memory private to a select node when a query graph is freed,
+does not free the heap where the node was originally created. */
+void
+sel_node_free_private(
+/*==================*/
+	sel_node_t*	node);	/*!< in: select node struct */
+/*********************************************************************//**
+Frees a prefetch buffer for a column, including the dynamically allocated
+memory for data stored there. */
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+	sel_buf_t*	prefetch_buf);	/*!< in, own: prefetch buffer */
+/**********************************************************************//**
+Performs a select step. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_sel_step(
+/*=========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+Performs a fetch for a cursor.
+@return query thread to run next or NULL */
+que_thr_t*
+fetch_step(
+/*=======*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/***********************************************************//**
+Prints a row in a select result.
+@return query thread to run next or NULL */
+que_thr_t*
+row_printf_step(
+/*============*/
+	que_thr_t*	thr);	/*!< in: query thread */
+
+/** Copy used fields from cached row.
+Copy cache record field by field, don't touch fields that
+are not covered by current key.
+@param[out]	buf		Where to copy the MySQL row.
+@param[in]	cached_rec	What to copy (in MySQL row format).
+@param[in]	prebuilt	prebuilt struct. */
+void
+row_sel_copy_cached_fields_for_mysql(
+	byte*		buf,
+	const byte*	cached_rec,
+	row_prebuilt_t*	prebuilt);
+
+/****************************************************************//**
+Converts a key value stored in MySQL format to an Innobase dtuple. The last
+field of the key value may be just a prefix of a fixed length field: hence
+the parameter key_len. But currently we do not allow search keys where the
+last field is only a prefix of the full key field len and print a warning if
+such appears. */
+void
+row_sel_convert_mysql_key_to_innobase(
+/*==================================*/
+	dtuple_t*	tuple,		/*!< in/out: tuple where to build;
+					NOTE: we assume that the type info
+					in the tuple is already according
+					to index! */
+	byte*		buf,		/*!< in: buffer to use in field
+					conversions; NOTE that dtuple->data
+					may end up pointing inside buf so
+					do not discard that buffer while
+					the tuple is being used. See
+					row_mysql_store_col_in_innobase_format()
+					in the case of DATA_INT */
+	ulint		buf_len,	/*!< in: buffer length */
+	dict_index_t*	index,		/*!< in: index of the key value */
+	const byte*	key_ptr,	/*!< in: MySQL key value */
+	ulint		key_len);	/*!< in: MySQL key value length */
+
+
+/** Search for rows in the database using cursor.
+This function is mainly used for tables that are shared across connections,
+so it employs techniques that help reconstruct the rows that the
+transaction is supposed to see.
+It also has optimizations such as pre-caching rows, using the AHI, etc.
+
+@param[out]	buf		buffer for the fetched row in MySQL format
+@param[in]	mode		search mode, e.g. PAGE_CUR_L
+@param[in,out]	prebuilt	prebuilt struct for the table handler;
+				this contains the info to search_tuple,
+				index; if search tuple contains 0 field then
+				we position the cursor at start or the end of
+				index, depending on 'mode'
+@param[in]	match_mode	0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX
+@param[in]	direction	0 or ROW_SEL_NEXT or ROW_SEL_PREV;
+				Note: if this is != 0, then prebuilt must have a
+				pcur with stored position! In opening of a
+				cursor 'direction' should be 0.
+@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
+DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
+dberr_t
+row_search_mvcc(
+	byte*		buf,
+	page_cur_mode_t	mode,
+	row_prebuilt_t*	prebuilt,
+	ulint		match_mode,
+	ulint		direction)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/********************************************************************//**
+Count rows in a R-Tree leaf level.
+@return DB_SUCCESS if successful */
+dberr_t
+row_count_rtree_recs(
+/*=================*/
+	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct for the
+					table handle; this contains the info
+					of search_tuple, index; if search
+					tuple contains 0 fields then we
+					position the cursor at the start or
+					the end of the index, depending on
+					'mode' */
+	ulint*		n_rows);	/*!< out: number of entries
+					seen in the consistent read */
+
+/**
+Check the index records in CHECK TABLE.
+The index must contain entries in an ascending order,
+unique constraint must not be violated by duplicated keys,
+and the number of index entries is counted according to the
+current read view.
+
+@param prebuilt    index and transaction
+@param n_rows      number of records counted
+
+@return error code
+@retval DB_SUCCESS  if no error was found */
+dberr_t row_check_index(row_prebuilt_t *prebuilt, ulint *n_rows)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Read the max AUTOINC value from an index.
+@param[in] index	index starting with an AUTO_INCREMENT column
+@return	the largest AUTO_INCREMENT value
+@retval	0	if no records were found */
+ib_uint64_t
+row_search_max_autoinc(dict_index_t* index)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** A structure for caching column values for prefetched rows */
+struct sel_buf_t{
+	byte*		data;	/*!< data, or NULL; if not NULL, this field
+				owns allocated memory which must be explicitly
+				freed; can be != NULL even when len is
+				UNIV_SQL_NULL */
+	ulint		len;	/*!< data length, or UNIV_SQL_NULL */
+	ulint		val_buf_size;
+				/*!< size of the memory buffer allocated for
+				data: this can be more than len; this is only
+				defined when data != NULL */
+};
+
+/** Copy used fields from cached row.
+Copy cache record field by field, don't touch fields that
+are not covered by current key.
+@param[out]     buf             Where to copy the MySQL row.
+@param[in]      cached_rec      What to copy (in MySQL row format).
+@param[in]      prebuilt        prebuilt struct. */
+void
+row_sel_copy_cached_fields_for_mysql(
+        byte*           buf,
+        const byte*     cached_rec,
+        row_prebuilt_t* prebuilt);
+
+/** Query plan: how one table of a select (join) is accessed */
+struct plan_t{
+	dict_table_t*	table;		/*!< table struct in the dictionary
+					cache */
+	dict_index_t*	index;		/*!< table index used in the search */
+	btr_pcur_t	pcur;		/*!< persistent cursor used to search
+					the index */
+	ibool		asc;		/*!< TRUE if the cursor moves upwards */
+	ibool		pcur_is_open;	/*!< TRUE if pcur has been positioned
+					and we can try to fetch new rows */
+	ibool		cursor_at_end;	/*!< TRUE if the cursor is open but
+					we know that there are no more
+					qualifying rows left to retrieve from
+					the index tree; NOTE though, that
+					there may still be unprocessed rows in
+					the prefetch stack; always FALSE when
+					pcur_is_open is FALSE */
+	ibool		stored_cursor_rec_processed;
+					/*!< TRUE if the pcur position has been
+					stored and the record it is positioned
+					on has already been processed */
+	que_node_t**	tuple_exps;	/*!< array of expressions
+					which are used to calculate
+					the field values in the search
+					tuple: there is one expression
+					for each field in the search
+					tuple */
+	dtuple_t*	tuple;		/*!< search tuple */
+	page_cur_mode_t	mode;		/*!< search mode: PAGE_CUR_G, ... */
+	ulint		n_exact_match;	/*!< number of first fields in
+					the search tuple which must be
+					exactly matched */
+	ibool		unique_search;	/*!< TRUE if we are searching an
+					index record with a unique key */
+	ulint		n_rows_fetched;	/*!< number of rows fetched using pcur
+					after it was opened */
+	ulint		n_rows_prefetched;/*!< number of prefetched rows cached
+					for fetch: fetching several rows in
+					the same mtr saves CPU time */
+	ulint		first_prefetched;/*!< index of the first cached row in
+					select buffer arrays for each column */
+	ibool		no_prefetch;	/*!< no prefetch for this table */
+	sym_node_list_t	columns;	/*!< symbol table nodes for the columns
+					to retrieve from the table */
+	UT_LIST_BASE_NODE_T(func_node_t)
+			end_conds;	/*!< conditions which determine the
+					fetch limit of the index segment we
+					have to look at: when one of these
+					fails, the result set has been
+					exhausted for the cursor in this
+					index; these conditions are normalized
+					so that in a comparison the column
+					for this table is the first argument */
+	UT_LIST_BASE_NODE_T(func_node_t)
+			other_conds;	/*!< the rest of the search conditions
+					we can test at this table in a join */
+	ibool		must_get_clust;	/*!< TRUE if index is a non-clustered
+					index and we must also fetch the
+					clustered index record; this is the
+					case if the non-clustered record does
+					not contain all the needed columns, or
+					if this is a single-table explicit
+					cursor, or a searched update or
+					delete */
+	ulint*		clust_map;	/*!< map telling how clust_ref is built
+					from the fields of a non-clustered
+					record */
+	dtuple_t*	clust_ref;	/*!< the reference to the clustered
+					index entry is built here if index is
+					a non-clustered index */
+	btr_pcur_t	clust_pcur;	/*!< if index is non-clustered, we use
+					this pcur to search the clustered
+					index */
+	mem_heap_t*	old_vers_heap;	/*!< memory heap used in building an old
+					version of a row, or NULL */
+};
+
+/** Select node states, stored in sel_node_t::state */
+enum sel_node_state {
+	SEL_NODE_CLOSED,	/*!< it is a declared cursor which is not
+				currently open */
+	SEL_NODE_OPEN,		/*!< intention locks not yet set on tables */
+	SEL_NODE_FETCH,		/*!< intention locks have been set */
+	SEL_NODE_NO_MORE_ROWS	/*!< cursor has reached the result set end */
+};
+
+/** Select statement node of a query graph */
+struct sel_node_t{
+	que_common_t	common;		/*!< node type: QUE_NODE_SELECT */
+	enum sel_node_state
+			state;	/*!< node state */
+	que_node_t*	select_list;	/*!< select list */
+	sym_node_t*	into_list;	/*!< variables list or NULL */
+	sym_node_t*	table_list;	/*!< table list */
+	ibool		asc;		/*!< TRUE if the rows should be fetched
+					in an ascending order */
+	ibool		set_x_locks;	/*!< TRUE if the cursor is for update or
+					delete, which means that a row x-lock
+					should be placed on the cursor row */
+	lock_mode	row_lock_mode;	/*!< LOCK_X or LOCK_S */
+	ulint		n_tables;	/*!< number of tables */
+	ulint		fetch_table;	/*!< number of the next table to access
+					in the join */
+	plan_t*		plans;		/*!< array of n_tables many plan nodes
+					containing the search plan and the
+					search data structures */
+	que_node_t*	search_cond;	/*!< search condition */
+	ReadView*	read_view;	/*!< if the query is a non-locking
+					consistent read, its read view is
+					placed here, otherwise NULL */
+	ibool		consistent_read;/*!< TRUE if the select is a consistent,
+					non-locking read */
+	order_node_t*	order_by;	/*!< order by column definition, or
+					NULL */
+	ibool		is_aggregate;	/*!< TRUE if the select list consists of
+					aggregate functions */
+	ibool		aggregate_already_fetched;
+					/*!< TRUE if the aggregate row has
+					already been fetched for the current
+					cursor */
+	ibool		can_get_updated;/*!< this is TRUE if the select
+					is in a single-table explicit
+					cursor which can get updated
+					within the stored procedure,
+					or in a searched update or
+					delete; NOTE that to determine
+					whether an explicit cursor
+					can get updated, the parser
+					checks whether the stored
+					procedure contains positioned
+					update or delete statements */
+	sym_node_t*	explicit_cursor;/*!< not NULL if an explicit cursor */
+	UT_LIST_BASE_NODE_T(sym_node_t)
+			copy_variables; /*!< variables whose values we have to
+					copy when an explicit cursor is opened,
+					so that they do not change between
+					fetches */
+};
+
+/**
+Return a pointer to one element of the plan array of a SELECT node.
+@param node  SELECT query graph node whose plans are accessed
+@param i     index of the plan; asserted to be less than node->n_tables
+@return pointer to plan number i */
+inline plan_t *sel_node_get_nth_plan(sel_node_t *node, ulint i)
+{
+  ut_ad(node->n_tables > i);
+  return node->plans + i;
+}
+
+/** Fetch statement node of a query graph */
+struct fetch_node_t{
+	que_common_t	common;		/*!< type: QUE_NODE_FETCH */
+	sel_node_t*	cursor_def;	/*!< cursor definition */
+	sym_node_t*	into_list;	/*!< variables to set */
+
+	pars_user_func_t*
+			func;		/*!< User callback function or NULL.
+					The first argument to the function
+					is a sel_node_t*, containing the
+					results of the SELECT operation for
+					one row. If the function returns
+					NULL, it is not interested in
+					further rows and the cursor is
+					modified so (cursor % NOTFOUND) is
+					true. If it returns non-NULL,
+					we continue normally. */
+};
+
+/** Open or close cursor operation type, stored in open_node_t::op_type */
+enum open_node_op {
+	ROW_SEL_OPEN_CURSOR,	/*!< open cursor */
+	ROW_SEL_CLOSE_CURSOR	/*!< close cursor */
+};
+
+/** Open or close cursor statement node of a query graph */
+struct open_node_t{
+	que_common_t	common;		/*!< type: QUE_NODE_OPEN */
+	enum open_node_op
+			op_type;	/*!< operation type: open or
+					close cursor */
+	sel_node_t*	cursor_def;	/*!< definition of the cursor */
+};
+
+/** Row printf statement node of a query graph */
+struct row_printf_node_t{
+	que_common_t	common;		/*!< type: QUE_NODE_ROW_PRINTF */
+	sel_node_t*	sel_node;	/*!< select node */
+};
+
+/** Search direction for the MySQL interface; see row_search_mvcc() */
+enum row_sel_direction {
+	ROW_SEL_NEXT = 1,	/*!< ascending direction */
+	ROW_SEL_PREV = 2	/*!< descending direction */
+};
+
+/** Match mode for the MySQL interface; see row_search_mvcc() */
+enum row_sel_match_mode {
+	ROW_SEL_EXACT = 1,	/*!< search using a complete key value */
+	ROW_SEL_EXACT_PREFIX	/*!< search using a key prefix which
+				must match rows: the prefix may
+				contain an incomplete field (the last
+				field in the prefix may be just a prefix
+				of a fixed length column) */
+};
+
+#ifdef UNIV_DEBUG
+/** Store a non-SQL-NULL field in MySQL format; idx and field are passed on. */
+# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \
+        row_sel_field_store_in_mysql_format_func(dest,templ,idx,field,src,len)
+#else /* UNIV_DEBUG */
+/** Store a non-SQL-NULL field in MySQL format; idx and field are dropped. */
+# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \
+        row_sel_field_store_in_mysql_format_func(dest,templ,src,len)
+#endif /* UNIV_DEBUG */
+
+/**************************************************************//**
+Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
+function is row_mysql_store_col_in_innobase_format() in row0mysql.cc. */
+
+void
+row_sel_field_store_in_mysql_format_func(
+/*=====================================*/
+        byte*           dest,   /*!< in/out: buffer where to store; NOTE
+                                that BLOBs are not in themselves
+                                stored here: the caller must allocate
+                                and copy the BLOB into buffer before,
+                                and pass the pointer to the BLOB in
+                                'data' */
+        const mysql_row_templ_t* templ,
+                                /*!< in: MySQL column template.
+                                Its following fields are referenced:
+                                type, is_unsigned, mysql_col_len,
+                                mbminlen, mbmaxlen */
+#ifdef UNIV_DEBUG
+        const dict_index_t* index,
+                                /*!< in: InnoDB index */
+        ulint           field_no,
+                                /*!< in: templ->rec_field_no or
+                                templ->clust_rec_field_no or
+                                templ->icp_rec_field_no */
+#endif /* UNIV_DEBUG */
+        const byte*     data,   /*!< in: data to store */
+        ulint           len);    /*!< in: length of the data */
diff --git a/storage/innobase/include/row0types.h b/storage/innobase/include/row0types.h
new file mode 100644
index 00000000..5e737c1c
--- /dev/null
+++ b/storage/innobase/include/row0types.h
@@ -0,0 +1,54 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0types.h
+Row operation global types
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "buf0types.h"
+
+struct plan_t;
+
+struct upd_t;
+struct upd_field_t;
+struct upd_node_t;
+struct del_node_t;
+struct ins_node_t;
+struct sel_node_t;
+struct open_node_t;
+struct fetch_node_t;
+
+struct row_printf_node_t;
+struct sel_buf_t;
+
+struct undo_node_t;
+
+struct purge_node_t;
+
+struct row_ext_t;
+
+/** Buffer for logging modifications during online index creation */
+struct row_log_t;
+
+/* MySQL data types */
+struct TABLE;
diff --git a/storage/innobase/include/row0uins.h b/storage/innobase/include/row0uins.h
new file mode 100644
index 00000000..a9877969
--- /dev/null
+++ b/storage/innobase/include/row0uins.h
@@ -0,0 +1,50 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0uins.h
+Fresh insert undo
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0uins_h
+#define row0uins_h
+
+#include "data0data.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/***********************************************************//**
+Undoes a fresh insert of a row to a table. A fresh insert means that
+the same clustered index unique key did not have any record, even delete
+marked, at the time of the insert.  InnoDB is eager in a rollback:
+if it figures out that an index record will be removed in the purge
+anyway, it will remove it in the rollback.
+@return DB_SUCCESS */
+dberr_t
+row_undo_ins(
+/*=========*/
+	undo_node_t*	node,	/*!< in: row undo node */
+	que_thr_t*	thr)	/*!< in: query thread */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#endif
diff --git a/storage/innobase/include/row0umod.h b/storage/innobase/include/row0umod.h
new file mode 100644
index 00000000..5032e103
--- /dev/null
+++ b/storage/innobase/include/row0umod.h
@@ -0,0 +1,46 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0umod.h
+Undo modify of a row
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0umod_h
+#define row0umod_h
+
+#include "data0data.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/***********************************************************//**
+Undoes a modify operation on a row of a table.
+@return DB_SUCCESS or error code */
+dberr_t
+row_undo_mod(
+/*=========*/
+	undo_node_t*	node,	/*!< in: row undo node */
+	que_thr_t*	thr)	/*!< in: query thread */
+	MY_ATTRIBUTE((warn_unused_result));
+
+#endif
diff --git a/storage/innobase/include/row0undo.h b/storage/innobase/include/row0undo.h
new file mode 100644
index 00000000..ae067a8a
--- /dev/null
+++ b/storage/innobase/include/row0undo.h
@@ -0,0 +1,114 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0undo.h
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0undo_h
+#define row0undo_h
+
+#include "trx0sys.h"
+#include "btr0types.h"
+#include "btr0pcur.h"
+#include "que0types.h"
+#include "row0types.h"
+
+/********************************************************************//**
+Creates a row undo node to a query graph.
+@return own: undo node */
+undo_node_t*
+row_undo_node_create(
+/*=================*/
+	trx_t*		trx,	/*!< in: transaction */
+	que_thr_t*	parent,	/*!< in: parent node, i.e., a thr node */
+	mem_heap_t*	heap);	/*!< in: memory heap where created */
+/***********************************************************//**
+Looks for the clustered index record when node has the row reference.
+The pcur in node is used in the search. If found, stores the row to node,
+and stores the position of pcur, and detaches it. The pcur must be closed
+by the caller in any case.
+@return true if found; NOTE the node->pcur must be closed by the
+caller, regardless of the return value */
+bool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+	undo_node_t*	node)	/*!< in/out: row undo node */
+	MY_ATTRIBUTE((warn_unused_result));
+/***********************************************************//**
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_undo_step(
+/*==========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+
+/* A single query thread will try to perform the undo for all successive
+versions of a clustered index record, if the transaction has modified it
+several times during the execution which is rolled back. It may happen
+that the task is transferred to another query thread, if the other thread
+is assigned to handle an undo log record in the chain of different versions
+of the record, and the other thread happens to get the x-latch to the
+clustered index record at the right time.
+	If a query thread notices that the clustered index record it is looking
+for is missing, or the roll ptr field in the record does not point to the
+undo log record the thread was assigned to handle, then it gives up the undo
+task for that undo log record, and fetches the next. This situation can occur
+just in the case where the transaction modified the same record several times
+and another thread is currently doing the undo for successive versions of
+that index record. */
+
+/** Undo node structure (query graph node of type QUE_NODE_UNDO) */
+struct undo_node_t{
+	que_common_t	common;	/*!< node type: QUE_NODE_UNDO */
+	bool		is_temp;/*!< whether this is a temporary table */
+	trx_t*		trx;	/*!< trx for which undo is done */
+	roll_ptr_t	roll_ptr;/*!< roll pointer to undo log record */
+	trx_undo_rec_t*	undo_rec;/*!< undo log record */
+	undo_no_t	undo_no;/*!< undo number of the record */
+	byte		rec_type;/*!< undo log record type: TRX_UNDO_INSERT_REC,
+				... */
+	trx_id_t	new_trx_id; /*!< trx id to restore to clustered index
+				record */
+	btr_pcur_t	pcur;	/*!< persistent cursor used in searching the
+				clustered index record */
+	dict_table_t*	table;	/*!< table where undo is done */
+	ulint		cmpl_info;/*!< compiler analysis of an update */
+	upd_t*		update;	/*!< update vector for a clustered index
+				record */
+	const dtuple_t*	ref;	/*!< row reference to the next row to handle */
+	dtuple_t*	row;	/*!< a copy (also fields copied to heap) of the
+				row to handle */
+	row_ext_t*	ext;	/*!< NULL, or prefixes of the externally
+				stored columns of the row */
+	dtuple_t*	undo_row;/*!< NULL, or the row after undo */
+	row_ext_t*	undo_ext;/*!< NULL, or prefixes of the externally
+				stored columns of undo_row */
+	dict_index_t*	index;	/*!< the next index whose record should be
+				handled */
+	mem_heap_t*	heap;	/*!< memory heap used as auxiliary storage for
+				row; this must be emptied after undo is tried
+				on a row */
+};
+
+#endif
diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h
new file mode 100644
index 00000000..f60fc359
--- /dev/null
+++ b/storage/innobase/include/row0upd.h
@@ -0,0 +1,559 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0upd.h
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0upd_h
+#define row0upd_h
+
+#include "data0data.h"
+#include "rem0types.h"
+#include "row0types.h"
+#include "btr0types.h"
+#include "trx0types.h"
+#include "btr0pcur.h"
+#include "que0types.h"
+#include "pars0types.h"
+
+/*********************************************************************//**
+Creates an update vector object.
+@return own: update vector object */
+UNIV_INLINE
+upd_t*
+upd_create(
+/*=======*/
+	ulint		n,	/*!< in: number of fields */
+	mem_heap_t*	heap);	/*!< in: heap from which memory allocated */
+/*********************************************************************//**
+Returns the number of fields in the update vector == number of columns
+to be updated by an update vector.
+@return number of fields */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+	const upd_t*	update);	/*!< in: update vector */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the nth field of an update vector.
+@return update vector field */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+	const upd_t*	update,	/*!< in: update vector */
+	ulint		n);	/*!< in: field position in update vector */
+#else
+# define upd_get_nth_field(update, n) ((update)->fields + (n))
+#endif
+
+/*********************************************************************//**
+Sets an index field number to be updated by an update vector field. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+	upd_field_t*	upd_field,	/*!< in/out: update vector field */
+	uint16_t	field_no,	/*!< in: field number in a clustered
+					index */
+	dict_index_t*	index);		/*!< in: index */
+
+/** Sets the field number of an update vector field, marking it as updated.
+@param[in,out]	upd_field	update vector field
+@param[in]	field_no	virtual column sequence num
+@param[in]	index		index */
+UNIV_INLINE
+void
+upd_field_set_v_field_no(
+	upd_field_t*	upd_field,
+	uint16_t	field_no,
+	dict_index_t*	index);
+/*********************************************************************//**
+Returns a field of an update vector by field_no.
+@return update vector field, or NULL */
+UNIV_INLINE
+const upd_field_t*
+upd_get_field_by_field_no(
+/*======================*/
+	const upd_t*	update,	/*!< in: update vector */
+	uint16_t	no,	/*!< in: field_no */
+	bool		is_virtual) /*!< in: if it is a virtual column */
+	MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Creates an update node for a query graph.
+@return own: update node */
+upd_node_t*
+upd_node_create(
+/*============*/
+	mem_heap_t*	heap);	/*!< in: mem heap where created */
+/***********************************************************//**
+Returns TRUE if row update changes size of some field in index or if some
+field to be updated is stored externally in rec or update.
+@return TRUE if the update changes the size of some field in index or
+the field is external in rec or update */
+ibool
+row_upd_changes_field_size_or_external(
+/*===================================*/
+	dict_index_t*	index,	/*!< in: index */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	const upd_t*	update);/*!< in: update vector */
+
+/***************************************************************//**
+Builds an update vector from those fields which in a secondary index entry
+differ from a record that has the equal ordering fields. NOTE: we compare
+the fields as binary strings!
+@return own: update vector of differing fields */
+upd_t*
+row_upd_build_sec_rec_difference_binary(
+/*====================================*/
+	const rec_t*	rec,	/*!< in: secondary index record */
+	dict_index_t*	index,	/*!< in: index */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	const dtuple_t*	entry,	/*!< in: entry to insert */
+	mem_heap_t*	heap)	/*!< in: memory heap from which allocated */
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
+/** Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, which in an index entry differ from a record that has
+the equal ordering fields. NOTE: we compare the fields as binary strings!
+@param[in]	index		clustered index
+@param[in]	entry		clustered index entry to insert
+@param[in]	rec		clustered index record
+@param[in]	offsets		rec_get_offsets(rec,index), or NULL
+@param[in]	no_sys		skip the system columns
+				DB_TRX_ID and DB_ROLL_PTR
+@param[in]	ignore_warnings ignore warnings during vcol calculation, which
+				means that this calculation is internal only
+@param[in]	trx		transaction (for diagnostics),
+				or NULL
+@param[in]	heap		memory heap from which allocated
+@param[in,out]	mysql_table	NULL, or mysql table object when
+				user thread invokes dml
+@param[out]	error		error number in case of failure
+@return own: update vector of differing fields, excluding roll ptr and
+trx id */
+upd_t*
+row_upd_build_difference_binary(
+	dict_index_t*	index,
+	const dtuple_t*	entry,
+	const rec_t*	rec,
+	const rec_offs*	offsets,
+	bool		no_sys,
+	bool		ignore_warnings,
+	trx_t*		trx,
+	mem_heap_t*	heap,
+	TABLE*		mysql_table,
+	dberr_t*	error)
+	MY_ATTRIBUTE((nonnull(1,2,3,8,10), warn_unused_result));
+/** Apply an update vector to an index entry.
+@param[in,out]	entry	index entry to be updated; the clustered index record
+			must be covered by a lock or a page latch to prevent
+			deletion (rollback or purge)
+@param[in]	index	index of the entry
+@param[in]	update	update vector built for the entry
+@param[in,out]	heap	memory heap for copying off-page columns */
+void
+row_upd_index_replace_new_col_vals_index_pos(
+	dtuple_t*		entry,
+	const dict_index_t*	index,
+	const upd_t*		update,
+	mem_heap_t*		heap)
+	MY_ATTRIBUTE((nonnull));
+/** Replace the new column values stored in the update vector,
+during trx_undo_prev_version_build().
+@param entry   clustered index tuple where the values are replaced
+               (the clustered index leaf page latch must be held)
+@param index   clustered index
+@param update  update vector for the clustered index
+@param heap    memory heap for allocating and copying values
+@return whether the previous version was built successfully */
+bool
+row_upd_index_replace_new_col_vals(dtuple_t *entry, const dict_index_t &index,
+                                   const upd_t *update, mem_heap_t *heap)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Replaces the new column values stored in the update vector. */
+void
+row_upd_replace(
+/*============*/
+	dtuple_t*		row,	/*!< in/out: row where replaced,
+					indexed by col_no;
+					the clustered index record must be
+					covered by a lock or a page latch to
+					prevent deletion (rollback or purge) */
+	row_ext_t**		ext,	/*!< out, own: NULL, or externally
+					stored column prefixes */
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const upd_t*		update,	/*!< in: an update vector built for the
+					clustered index */
+	mem_heap_t*		heap);	/*!< in: memory heap */
+/** Replaces the virtual column values stored in a dtuple with those of
+an update vector.
+@param[in,out]	row	dtuple whose columns are to be updated
+@param[in]	table	table
+@param[in]	update	an update vector built for the clustered index
+@param[in]	upd_new	update to new or old value
+@param[in,out]	undo_row undo row (if needs to be updated)
+@param[in]	ptr	remaining part in update undo log */
+void
+row_upd_replace_vcol(
+	dtuple_t*		row,
+	const dict_table_t*	table,
+	const upd_t*		update,
+	bool			upd_new,
+	dtuple_t*		undo_row,
+	const byte*		ptr);
+
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector changes an ordering field in the index record */
+ibool
+row_upd_changes_ord_field_binary_func(
+/*==================================*/
+	dict_index_t*	index,	/*!< in: index of the record */
+	const upd_t*	update,	/*!< in: update vector for the row; NOTE: the
+				field numbers in this MUST be clustered index
+				positions! */
+#ifdef UNIV_DEBUG
+	const que_thr_t*thr,	/*!< in: query thread */
+#endif /* UNIV_DEBUG */
+	const dtuple_t*	row,	/*!< in: old value of row, or NULL if the
+				row and the data values in update are not
+				known when this function is called, e.g., at
+				compile time */
+	const row_ext_t*ext,	/*!< NULL, or prefixes of the externally
+				stored columns in the old row */
+	ulint		flag)	/*!< in: ROW_BUILD_NORMAL,
+				ROW_BUILD_FOR_PURGE or ROW_BUILD_FOR_UNDO */
+	MY_ATTRIBUTE((nonnull(1,2), warn_unused_result));
+#ifdef UNIV_DEBUG
+# define row_upd_changes_ord_field_binary(index,update,thr,row,ext)	\
+	row_upd_changes_ord_field_binary_func(index,update,thr,row,ext,0)
+#else /* UNIV_DEBUG */
+# define row_upd_changes_ord_field_binary(index,update,thr,row,ext)	\
+	row_upd_changes_ord_field_binary_func(index,update,row,ext,0)
+#endif /* UNIV_DEBUG */
+/***********************************************************//**
+Checks if an FTS indexed column is affected by an UPDATE.
+@return offset within fts_t::indexes if FTS indexed column updated else
+ULINT_UNDEFINED */
+ulint
+row_upd_changes_fts_column(
+/*=======================*/
+	dict_table_t*	table,		/*!< in: table */
+	upd_field_t*	upd_field);	/*!< in: field to check */
+/***********************************************************//**
+Checks if an FTS Doc ID column is affected by an UPDATE.
+@return whether Doc ID column is affected */
+bool
+row_upd_changes_doc_id(
+/*===================*/
+	dict_table_t*	table,		/*!< in: table */
+	upd_field_t*	upd_field)	/*!< in: field to check */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector may change an ordering field in an index
+record */
+ibool
+row_upd_changes_some_index_ord_field_binary(
+/*========================================*/
+	const dict_table_t*	table,	/*!< in: table */
+	const upd_t*		update);/*!< in: update vector for the row */
+/***********************************************************//**
+Updates a row in a table. This is a high-level function used
+in SQL execution graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_upd_step(
+/*=========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+
+/** Update vector field: one element of the upd_t::fields array */
+struct upd_field_t{
+	uint16_t	field_no;	/*!< field number in an index, usually
+					the clustered index, but in updating
+					a secondary index record in btr0cur.cc
+					this is the position in the secondary
+					index. If this field is a virtual
+					column, then field_no represents
+					the nth virtual column in the table */
+	uint16_t	orig_len;	/*!< original length of the locally
+					stored part of an externally stored
+					column, or 0 */
+	que_node_t*	exp;		/*!< expression for calculating a new
+					value: it refers to column values and
+					constants in the symbol table of the
+					query graph */
+	dfield_t	new_val;	/*!< new value for the column */
+	dfield_t*	old_v_val;	/*!< old value for the virtual column */
+};
+
+
+/* check whether an update field is on virtual column */
+#define upd_fld_is_virtual_col(upd_fld)			\
+	(((upd_fld)->new_val.type.prtype & DATA_VIRTUAL) == DATA_VIRTUAL)
+
+/* set DATA_VIRTUAL bit on update field to show it is a virtual column */
+#define upd_fld_set_virtual_col(upd_fld)			\
+	((upd_fld)->new_val.type.prtype |= DATA_VIRTUAL)
+
+/* Update vector structure */
+struct upd_t{
+	mem_heap_t*	heap;		/*!< heap from which memory allocated */
+	byte		info_bits;	/*!< new value of info bits to record;
+					default is 0 */
+	dtuple_t*	old_vrow;	/*!< pointer to old row, used for
+					virtual column update now */
+	ulint		n_fields;	/*!< number of update fields */
+	upd_field_t*	fields;		/*!< array of update fields */
+	byte		vers_sys_value[8]; /*!< buffer for updating system fields */
+
+	/** Copy an update field into the slot after the last used one.
+	@param[in]	field	an update field */
+	void append(const upd_field_t& field)
+	{
+		fields[n_fields++] = field;
+	}
+
+	/** Drop the field at a position and close the gap that it leaves.
+	@param[in]	i	position in fields[]; must be below n_fields */
+	void remove_element(ulint i)
+	{
+		ut_ad(n_fields > 0);
+		ut_ad(i < n_fields);
+		for (const ulint last = n_fields - 1; i < last; i++) {
+			fields[i] = fields[i + 1];
+		}
+		n_fields--;
+	}
+
+	/** Drop the first field that carries the given field number.
+	@param[in]	field_no	field number to search for
+	@return whether a field was found and removed */
+	bool remove(const ulint field_no)
+	{
+		for (ulint pos = 0; pos < n_fields; pos++) {
+			if (fields[pos].field_no != field_no) {
+				continue;
+			}
+			remove_element(pos);
+			return true;
+		}
+		return false;
+	}
+
+	/** Determine if the given field_no is modified.
+	@return true if modified, false otherwise.  */
+	bool is_modified(uint16_t field_no) const
+	{
+		bool	found = false;
+		for (ulint i = 0; !found && i < n_fields; i++) {
+			found = fields[i].field_no == field_no;
+		}
+		return found;
+	}
+
+	/** Determine if the update affects a system versioned column or row_end. */
+	bool affects_versioned() const
+	{
+		for (ulint i = 0; i < n_fields; i++) {
+			dtype_t type = fields[i].new_val.type;
+			// versioned DELETE is UPDATE SET row_end=NOW
+			if (type.is_versioned() || type.vers_sys_end()) {
+				return true;
+			}
+		}
+		return false;
+	}
+
+	/** @return whether this is for a hidden metadata record
+	for instant ALTER TABLE */
+	bool is_metadata() const { return dtuple_t::is_metadata(info_bits); }
+	/** @return whether this is for a hidden metadata record
+	for instant ALTER TABLE (not only ADD COLUMN) */
+	bool is_alter_metadata() const
+	{ return dtuple_t::is_alter_metadata(info_bits); }
+
+#ifdef UNIV_DEBUG
+	/** Assert that each externally stored new value has a length of at
+	least BTR_EXTERN_FIELD_REF_SIZE. @return true (always) */
+	bool validate() const
+	{
+		for (ulint i = 0; i < n_fields; ++i) {
+			dfield_t* field = &fields[i].new_val;
+			if (dfield_is_ext(field)) {
+				ut_ad(dfield_get_len(field)
+				      >= BTR_EXTERN_FIELD_REF_SIZE);
+			}
+		}
+		return true;
+	}
+#endif // UNIV_DEBUG
+};
+
+/** Kinds of delete that an update node performs (upd_node_t::is_delete) */
+enum delete_mode_t {
+	NO_DELETE = 0,		/*!< this operation does not delete */
+	PLAIN_DELETE,		/*!< ordinary delete */
+	VERSIONED_DELETE	/*!< update old and insert a new row */
+};
+
+/* Update node structure which also implements the delete operation
+of a row */
+
+struct upd_node_t{
+	que_common_t	common;	/*!< node type: QUE_NODE_UPDATE */
+	delete_mode_t	is_delete;	/*!< kind of DELETE */
+	ibool		searched_update;
+				/* TRUE if searched update, FALSE if
+				positioned */
+	bool		in_mysql_interface;
+				/* whether the update node was created
+				for the MySQL interface */
+	dict_foreign_t*	foreign;/* NULL or pointer to a foreign key
+				constraint if this update node is used in
+				doing an ON DELETE or ON UPDATE operation */
+	upd_node_t*	cascade_node;/* NULL or an update node template which
+				is used to implement ON DELETE/UPDATE CASCADE
+				or ... SET NULL for foreign keys */
+	mem_heap_t*	cascade_heap;
+				/*!< NULL or a mem heap where cascade
+				node is created.*/
+	sel_node_t*	select;	/*!< query graph subtree implementing a base
+				table cursor: the rows returned will be
+				updated */
+	btr_pcur_t*	pcur;	/*!< persistent cursor placed on the clustered
+				index record which should be updated or
+				deleted; the cursor is stored in the graph
+				of 'select' field above, except in the case
+				of the MySQL interface */
+	dict_table_t*	table;	/*!< table where updated */
+	upd_t*		update;	/*!< update vector for the row */
+	ulint		update_n_fields;
+				/* when this struct is used to implement
+				a cascade operation for foreign keys, we store
+				here the size of the buffer allocated for use
+				as the update vector */
+	sym_node_list_t	columns;/* symbol table nodes for the columns
+				to retrieve from the table */
+	ibool		has_clust_rec_x_lock;
+				/* TRUE if the select which retrieves the
+				records to update already sets an x-lock on
+				the clustered record; note that it must always
+				set at least an s-lock */
+	ulint		cmpl_info;/* information extracted during query
+				compilation; speeds up execution:
+				UPD_NODE_NO_ORD_CHANGE and
+				UPD_NODE_NO_SIZE_CHANGE, ORed */
+	/*----------------------*/
+	/* Local storage for this graph node */
+	ulint		state;	/*!< node execution state */
+	dict_index_t*	index;	/*!< NULL, or the next index whose record should
+				be updated */
+	dtuple_t*	row;	/*!< NULL, or a copy (also fields copied to
+				heap) of the row to update; this must be reset
+				to NULL after a successful update */
+	dtuple_t*	historical_row;	/*!< historical row used in
+				CASCADE UPDATE/SET NULL;
+				allocated from historical_heap  */
+	mem_heap_t*	historical_heap; /*!< heap for historical row insertion;
+				created when row to update is located;
+				freed right before row update */
+	row_ext_t*	ext;	/*!< NULL, or prefixes of the externally
+				stored columns in the old row */
+	dtuple_t*	upd_row;/* NULL, or a copy of the updated row */
+	row_ext_t*	upd_ext;/* NULL, or prefixes of the externally
+				stored columns in upd_row */
+	mem_heap_t*	heap;	/*!< memory heap used as auxiliary storage;
+				this must be emptied after a successful
+				update */
+	/*----------------------*/
+	sym_node_t*	table_sym;/* table node in symbol table */
+	que_node_t*	col_assign_list;
+				/* column assignment list */
+	ulint		magic_n; /*!< NOTE(review): presumably UPD_NODE_MAGIC_N; confirm */
+
+private:
+	/** Appends row_start or row_end field to update vector and sets a
+	CURRENT_TIMESTAMP/trx->id value to it.
+	Supposed to be called only by vers_make_update() and
+	vers_make_delete().
+	@param[in]	trx	transaction
+	@param[in]	idx	table->vers_start, or presumably its row_end counterpart */
+  void vers_update_fields(const trx_t *trx, ulint idx);
+
+public:
+	/** Also set row_start = CURRENT_TIMESTAMP/trx->id
+	@param[in]	trx	transaction */
+  void vers_make_update(const trx_t *trx)
+  {
+    vers_update_fields(trx, table->vers_start);
+  }
+
+  /** Prepare update vector for versioned delete.
+  Set row_end to CURRENT_TIMESTAMP or trx->id.
+  Initialize fts_next_doc_id for versioned delete.
+  @param[in] trx transaction */
+  void vers_make_delete(trx_t *trx);
+};
+
+#define	UPD_NODE_MAGIC_N	1579975
+
+/* Node execution states */
+#define UPD_NODE_SET_IX_LOCK	   1	/* execution came to the node from
+					a node above and if the field
+					has_clust_rec_x_lock is FALSE, we
+					should set an intention x-lock on
+					the table */
+#define UPD_NODE_UPDATE_CLUSTERED  2	/* clustered index record should be
+					updated */
+#define UPD_NODE_INSERT_CLUSTERED  3	/* clustered index record should be
+					inserted, old record is already delete
+					marked */
+#define UPD_NODE_UPDATE_ALL_SEC	   5	/* an ordering field of the clustered
+					index record was changed, or this is
+					a delete operation: should update
+					all the secondary index records */
+#define UPD_NODE_UPDATE_SOME_SEC   6	/* secondary index entries should be
+					looked at and updated if an ordering
+					field changed */
+
+/* Compilation info flags: these must fit within 3 bits; see trx0rec.h */
+#define UPD_NODE_NO_ORD_CHANGE	1	/* no secondary index record will be
+					changed in the update and no ordering
+					field of the clustered index */
+#define UPD_NODE_NO_SIZE_CHANGE	2	/* no record field size will be
+					changed in the update */
+
+
+#include "row0upd.inl"
+
+#endif
diff --git a/storage/innobase/include/row0upd.inl b/storage/innobase/include/row0upd.inl
new file mode 100644
index 00000000..13aacf3f
--- /dev/null
+++ b/storage/innobase/include/row0upd.inl
@@ -0,0 +1,153 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0upd.inl
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+#include "trx0trx.h"
+#include "trx0undo.h"
+#include "row0row.h"
+#include "lock0lock.h"
+#include "page0zip.h"
+
+/*********************************************************************//**
+Allocates a zero-filled update vector with room for n fields.
+@return own: update vector object */
+UNIV_INLINE
+upd_t*
+upd_create(
+/*=======*/
+	ulint		n,	/*!< in: number of fields */
+	mem_heap_t*	heap)	/*!< in: heap from which memory allocated */
+{
+	/* The header and the field array share one zero-filled allocation. */
+	const ulint	size = sizeof(upd_t) + sizeof(upd_field_t) * n;
+	upd_t*		vec = static_cast<upd_t*>(mem_heap_zalloc(heap, size));
+
+	/* The field array begins right after the header. */
+	vec->fields = reinterpret_cast<upd_field_t*>(vec + 1);
+	vec->n_fields = n;
+	vec->heap = heap;
+
+	return vec;
+}
+
+/*********************************************************************//**
+Reads the field count of an update vector, which equals the number of
+columns that the vector updates.
+@return number of fields */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+	const upd_t*	update)	/*!< in: update vector */
+{
+	ut_ad(update);
+	const ulint	count = update->n_fields;
+	return count;
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Debug-build accessor for the nth field of an update vector; asserts that
+the position is below update->n_fields.
+@return update vector field */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+	const upd_t*	update,	/*!< in: update vector */
+	ulint		n)	/*!< in: field position in update vector */
+{
+	ut_ad(update);
+	ut_ad(n < update->n_fields);
+	return update->fields + n;
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Sets the index field number of an update vector field and its column type. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+	upd_field_t*	upd_field,	/*!< in/out: update vector field */
+	uint16_t	field_no,	/*!< in: field number in a clustered
+					index */
+	dict_index_t*	index)		/*!< in: index */
+{
+	const dict_col_t*	col = dict_index_get_nth_col(index, field_no);
+	upd_field->orig_len = 0;
+	upd_field->field_no = field_no;
+	dict_col_copy_type(col, dfield_get_type(&upd_field->new_val));
+}
+
+/** Sets the virtual column number and the column type of an update field.
+@param[in,out]	upd_field	update vector field
+@param[in]	field_no	virtual column sequence num
+@param[in]	index		index */
+UNIV_INLINE
+void
+upd_field_set_v_field_no(
+	upd_field_t*	upd_field,
+	uint16_t	field_no,
+	dict_index_t*	index)
+{
+	ut_a(field_no < dict_table_get_n_v_cols(index->table));
+	const dict_col_t*	v_col
+		= &dict_table_get_nth_v_col(index->table, field_no)->m_col;
+	upd_field->orig_len = 0;
+	upd_field->field_no = field_no;
+	dict_col_copy_type(v_col,
+			   dfield_get_type(&upd_field->new_val));
+}
+
+/*********************************************************************//**
+Looks up a field of an update vector by its field_no and kind of column.
+@return update vector field, or NULL */
+UNIV_INLINE
+const upd_field_t*
+upd_get_field_by_field_no(
+/*======================*/
+	const upd_t*	update,	/*!< in: update vector */
+	uint16_t	no,	/*!< in: field_no */
+	bool		is_virtual) /*!< in: if it is virtual column */
+{
+	const ulint	n_fields = upd_get_n_fields(update);
+
+	for (ulint pos = 0; pos < n_fields; pos++) {
+		const upd_field_t*	candidate
+			= upd_get_nth_field(update, pos);
+		const bool		candidate_is_virtual
+			= upd_fld_is_virtual_col(candidate);
+		/* A stored column never matches a virtual one,
+		and vice versa, even if the numbers coincide. */
+		if (candidate_is_virtual == is_virtual
+		    && candidate->field_no == no) {
+			return candidate;
+		}
+	}
+
+	return NULL;
+}
diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h
new file mode 100644
index 00000000..60f310e1
--- /dev/null
+++ b/storage/innobase/include/row0vers.h
@@ -0,0 +1,143 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0vers.h
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0vers_h
+#define row0vers_h
+
+#include "data0data.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "rem0types.h"
+#include "mtr0mtr.h"
+#include "dict0mem.h"
+#include "row0types.h"
+
+// Forward declaration
+class ReadView;
+
+/** Determine if an active transaction has inserted or modified a secondary
+index record.
+@param[in,out]	caller_trx	trx of current thread
+@param[in]	rec	secondary index record
+@param[in]	index	secondary index
+@param[in]	offsets	rec_get_offsets(rec, index)
+@return	the active transaction; state must be rechecked after
+acquiring trx->mutex, and trx->release_reference() must be invoked
+@retval	NULL if the record was committed */
+trx_t*
+row_vers_impl_x_locked(
+	trx_t*		caller_trx,
+	const rec_t*	rec,
+	dict_index_t*	index,
+	const rec_offs*	offsets);
+
+/** Finds out if a version of the record, where the version >= the current
+purge_sys.view, should have ientry as its secondary index entry. We check
+if there is any not delete marked version of the record where the trx
+id >= purge view, and the secondary index entry == ientry; exactly in
+this case we return true.
+@param[in]	also_curr	whether rec itself is included in the versions
+				to search; otherwise only versions prior
+				to it are searched
+@param[in]	rec		record in the clustered index; the caller
+				must have a latch on the page
+@param[in]	mtr		mtr holding the latch on rec; it will
+				also hold the latch on purge_view
+@param[in]	index		secondary index
+@param[in]	ientry		secondary index entry
+@param[in]	roll_ptr	roll_ptr for the purge record
+@param[in]	trx_id		transaction ID on the purging record
+@return whether such a version should have ientry as its index entry */
+bool
+row_vers_old_has_index_entry(
+	bool			also_curr,
+	const rec_t*		rec,
+	mtr_t*			mtr,
+	dict_index_t*		index,
+	const dtuple_t*		ientry,
+	roll_ptr_t		roll_ptr,
+	trx_id_t		trx_id);
+
+/*****************************************************************//**
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
+@return error code
+@retval DB_SUCCESS if a previous version was fetched
+@retval DB_MISSING_HISTORY if the history is missing (a sign of corruption) */
+dberr_t
+row_vers_build_for_consistent_read(
+/*===============================*/
+	const rec_t*	rec,	/*!< in: record in a clustered index; the
+				caller must have a latch on the page; this
+				latch locks the top of the stack of versions
+				of this record */
+	mtr_t*		mtr,	/*!< in: mtr holding the latch on rec; it will
+				also hold the latch on purge_view */
+	dict_index_t*	index,	/*!< in: the clustered index */
+	rec_offs**	offsets,/*!< in/out: offsets returned by
+				rec_get_offsets(rec, index) */
+	ReadView*	view,	/*!< in: the consistent read view */
+	mem_heap_t**	offset_heap,/*!< in/out: memory heap from which
+				the offsets are allocated */
+	mem_heap_t*	in_heap,/*!< in: memory heap from which the memory for
+				*old_vers is allocated; memory for possible
+				intermediate versions is allocated and freed
+				locally within the function */
+	rec_t**		old_vers,/*!< out, own: old version, or NULL
+				if the history is missing or the record
+				does not exist in the view, that is,
+				it was freshly inserted afterwards */
+	dtuple_t**	vrow);	/*!< out: reports virtual column info if any */
+
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read. */
+void
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+	trx_t*		caller_trx,/*!< in/out: trx of current thread */
+	const rec_t*	rec,	/*!< in: record in a clustered index; the
+				caller must have a latch on the page; this
+				latch locks the top of the stack of versions
+				of this record */
+	mtr_t*		mtr,	/*!< in: mtr holding the latch on rec */
+	dict_index_t*	index,	/*!< in: the clustered index */
+	rec_offs**	offsets,/*!< in/out: offsets returned by
+				rec_get_offsets(rec, index) */
+	mem_heap_t**	offset_heap,/*!< in/out: memory heap from which
+				the offsets are allocated */
+	mem_heap_t*	in_heap,/*!< in: memory heap from which the memory for
+				*old_vers is allocated; memory for possible
+				intermediate versions is allocated and freed
+				locally within the function */
+	const rec_t**	old_vers,/*!< out: rec, old version, or NULL if the
+				record does not exist in the view, that is,
+				it was freshly inserted afterwards */
+	dtuple_t**	vrow);	/*!< out: holds virtual column info if any
+				is updated in the view */
+
+#endif
diff --git a/storage/innobase/include/rw_lock.h b/storage/innobase/include/rw_lock.h
new file mode 100644
index 00000000..4881f2f1
--- /dev/null
+++ b/storage/innobase/include/rw_lock.h
@@ -0,0 +1,138 @@
+/*****************************************************************************
+
+Copyright (c) 2020, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include <atomic>
+#include "my_dbug.h"
+
+/** Simple read-write lock based on a single std::atomic lock word */
+class rw_lock
+{
+  /** The lock word: WRITER, WRITER_WAITING, and a count of shared locks */
+  std::atomic<uint32_t> lock;
+
+protected:
+  /** Available lock */
+  static constexpr uint32_t UNLOCKED= 0;
+  /** Flag to indicate that write_lock() is being held */
+  static constexpr uint32_t WRITER= 1U << 31;
+  /** Flag to indicate that write_lock_wait() is pending */
+  static constexpr uint32_t WRITER_WAITING= 1U << 30;
+  /** Flag to indicate that write_lock() or write_lock_wait() is pending */
+  static constexpr uint32_t WRITER_PENDING= WRITER | WRITER_WAITING;
+
+  /** Start waiting for an exclusive lock, by setting WRITER_WAITING. */
+  void write_lock_wait_start()
+  {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    static_assert(WRITER_WAITING == 1U << 30, "compatibility");
+    __asm__ __volatile__("lock btsl $30, %0" : "+m" (lock));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    static_assert(WRITER_WAITING == 1U << 30, "compatibility");
+    _interlockedbittestandset(reinterpret_cast<volatile long*>(&lock), 30);
+#else
+    lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed);
+#endif
+  }
+  /** Start waiting for an exclusive lock, by setting WRITER_WAITING.
+  @return the value of the lock word before WRITER_WAITING was set */
+  uint32_t write_lock_wait_start_read()
+  { return lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed); }
+  /** Try to replace the lock word value l with WRITER.
+  @param l the expected lock word; replaced with the actual value on failure
+  @return whether the exclusive lock was acquired */
+  bool write_lock_wait_try(uint32_t &l)
+  {
+    return lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire,
+                                        std::memory_order_relaxed);
+  }
+  /** Try to acquire a shared lock.
+  @param l (out) the lock word value observed by the last compare-exchange
+  @return whether the lock was acquired */
+  bool read_trylock(uint32_t &l)
+  {
+    l= UNLOCKED;
+    while (!lock.compare_exchange_strong(l, l + 1, std::memory_order_acquire,
+                                         std::memory_order_relaxed))
+    {
+      DBUG_ASSERT(!(WRITER & l) || !(~WRITER_PENDING & l));
+      if (l & WRITER_PENDING) /* a writer holds or is waiting for the lock */
+        return false;
+    }
+    return true;
+  }
+
+  /** Wait for an exclusive lock.
+  @return whether the exclusive lock was acquired */
+  bool write_lock_poll()
+  {
+    auto l= WRITER_WAITING; /* expect only the WRITER_WAITING flag */
+    if (write_lock_wait_try(l))
+      return true;
+    if (!(l & WRITER_WAITING))
+      /* write_lock() must have succeeded for another thread */
+      write_lock_wait_start();
+    return false;
+  }
+  /** @return the lock word value */
+  uint32_t value() const { return lock.load(std::memory_order_acquire); }
+
+public:
+  /** Default constructor */
+  rw_lock() : lock(UNLOCKED) {}
+
+  /** Release a shared lock.
+  @return whether any writers may have to be woken up */
+  bool read_unlock()
+  {
+    auto l= lock.fetch_sub(1, std::memory_order_release);
+    DBUG_ASSERT(!(l & WRITER)); /* no write lock must have existed */
+    DBUG_ASSERT(~(WRITER_PENDING) & l); /* at least one read lock */
+    return (~WRITER_PENDING & l) == 1; /* this was the last shared lock */
+  }
+  /** Release an exclusive lock */
+  void write_unlock()
+  {
+    /* Below, we use fetch_sub(WRITER) instead of fetch_and(~WRITER).
+    The reason is that on IA-32 and AMD64 it translates into the 80486
+    instruction LOCK XADD, while fetch_and() translates into a loop
+    around LOCK CMPXCHG. For other ISA either form should be fine. */
+    static_assert(WRITER == 1U << 31, "compatibility");
+    IF_DBUG_ASSERT(auto l=,) lock.fetch_sub(WRITER, std::memory_order_release);
+    /* the write lock must have existed */
+    DBUG_ASSERT(l & WRITER);
+  }
+  /** Try to acquire a shared lock.
+  @return whether the lock was acquired */
+  bool read_trylock() { uint32_t l; return read_trylock(l); }
+  /** Try to acquire an exclusive lock.
+  @return whether the lock was acquired */
+  bool write_trylock()
+  {
+    auto l= UNLOCKED; /* succeeds only if nothing is held or pending */
+    return lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire,
+                                        std::memory_order_relaxed);
+  }
+
+  /** @return whether an exclusive lock is being held by any thread */
+  bool is_write_locked() const { return !!(value() & WRITER); }
+  /** @return whether any lock is being held or waited for by any thread */
+  bool is_locked_or_waiting() const { return value() != 0; }
+  /** @return whether any lock is being held by any thread */
+  bool is_locked() const { return (value() & ~WRITER_WAITING) != 0; }
+};
diff --git a/storage/innobase/include/small_vector.h b/storage/innobase/include/small_vector.h
new file mode 100644
index 00000000..d28a3618
--- /dev/null
+++ b/storage/innobase/include/small_vector.h
@@ -0,0 +1,100 @@
+/*****************************************************************************
+
+Copyright (c) 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+/* A normally small vector, inspired by llvm::SmallVector */
+#include "my_global.h"
+#include <iterator>
+#include <memory>
+
+class small_vector_base
+{
+protected:
+  using Size_T= uint32_t;
+  void *BeginX; /*!< start of the element storage that is in use */
+  Size_T Size= 0, Capacity; /*!< element count; number of allocated slots */
+  small_vector_base()= delete;
+  small_vector_base(void *small, size_t small_size)
+    : BeginX(small), Capacity(static_cast<Size_T>(small_size)) {}
+  ATTRIBUTE_COLD void grow_by_1(void *small, size_t element_size);
+public:
+  size_t size() const { return Size; }
+  size_t capacity() const { return Capacity; }
+  bool empty() const { return Size == 0; }
+  void clear() { Size= 0; } /* only resets the element count */
+protected:
+  void set_size(size_t N) { Size= static_cast<Size_T>(N); }
+};
+
+template <typename T, unsigned N>
+class small_vector : public small_vector_base
+{
+  /** The fixed storage allocation: the initial storage, for N elements */
+  T small[N];
+  using small_vector_base::set_size;
+
+  /** Invoke grow_by_1() if no allocated element is left unused */
+  void grow_if_needed()
+  {
+    if (unlikely(size() >= capacity()))
+      grow_by_1(small, sizeof *small);
+  }
+
+public:
+  /** Construct an empty vector that uses the fixed storage allocation */
+  small_vector() : small_vector_base(small, N)
+  { TRASH_ALLOC(small, sizeof small); }
+  ~small_vector()
+  {
+    if (small != begin()) /* the storage is not the fixed allocation */
+      my_free(begin());
+    MEM_MAKE_ADDRESSABLE(small, sizeof small);
+  }
+
+  using iterator= T *;
+  using const_iterator= const T *;
+  using reverse_iterator= std::reverse_iterator<iterator>;
+  using reference= T &;
+  using const_reference= const T&;
+
+  iterator begin() { return static_cast<iterator>(BeginX); }
+  const_iterator begin() const { return static_cast<const_iterator>(BeginX); }
+  iterator end() { return begin() + size(); }
+  const_iterator end() const { return begin() + size(); }
+  reverse_iterator rbegin() { return reverse_iterator(end()); }
+  reverse_iterator rend() { return reverse_iterator(begin()); }
+
+  reference operator[](size_t i) { assert(i < size()); return begin()[i]; }
+  const_reference operator[](size_t i) const
+  { return const_cast<small_vector&>(*this)[i]; }
+
+  /** Remove the elements [S,E) by moving [E,end()) to start at S */
+  void erase(const_iterator S, const_iterator E)
+  {
+    set_size(std::move(const_cast<iterator>(E), end(),
+                       const_cast<iterator>(S)) - begin());
+  }
+
+  /** Append an element that is move-constructed (or else copied) from arg */
+  void emplace_back(T &&arg)
+  {
+    grow_if_needed();
+    ::new (end()) T(static_cast<T&&>(arg)); /* a bare T(arg) would copy */
+    set_size(size() + 1);
+  }
+};
diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h
new file mode 100644
index 00000000..51f3049b
--- /dev/null
+++ b/storage/innobase/include/srv0mon.h
@@ -0,0 +1,846 @@
+/***********************************************************************
+
+Copyright (c) 2010, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/srv0mon.h
+Server monitor counter related defines
+
+Created 12/15/2009	Jimmy Yang
+*******************************************************/
+
+#ifndef srv0mon_h
+#define srv0mon_h
+
+#include "univ.i"
+
+#ifndef __STDC_LIMIT_MACROS
+/* Required for FreeBSD so that INT64_MAX is defined. */
+#define __STDC_LIMIT_MACROS
+#endif /* __STDC_LIMIT_MACROS */
+
+#include <cstdint>
+#include "my_atomic.h"
+#include "my_atomic_wrapper.h"
+
+/** Possible status values for "mon_status" in "struct monitor_value_t" */
+enum monitor_running_status {
+	MONITOR_STARTED = 1,	/*!< Monitor has been turned on */
+	MONITOR_STOPPED = 2	/*!< Monitor has been turned off */
+};
+
+typedef enum monitor_running_status	monitor_running_t;
+
+/** Monitor counter value type (signed 64-bit) */
+typedef	int64_t				mon_type_t;
+
+/** Two monitor structures are defined in this file. One is
+"monitor_value_t" which contains dynamic counter values for each
+counter. The other is "monitor_info_t", which contains
+static information (counter name, desc etc.) for each counter.
+In addition, an enum datatype "monitor_id_t" is also defined;
+it identifies each monitor with an internally used symbol, whose
+integer value indexes into the above two structures for its dynamic
+and static information.
+Developers who intend to add new counters need to
+fill in counter information as described in "monitor_info_t" and
+create the internal counter ID in "monitor_id_t". */
+
+/** Structure containing the actual values of a monitor counter. */
+struct monitor_value_t {
+	time_t	mon_start_time;	/*!< Start time of monitoring  */
+	time_t	mon_stop_time;	/*!< Stop time of monitoring */
+	time_t	mon_reset_time;	/*!< Time of resetting the counter */
+	mon_type_t	mon_value;	/*!< Current counter value */
+	mon_type_t	mon_max_value;	/*!< Current Max value */
+	mon_type_t	mon_min_value;	/*!< Current Min value */
+	mon_type_t	mon_value_reset;/*!< value at last reset */
+	mon_type_t	mon_max_value_start; /*!< Max value since start */
+	mon_type_t	mon_min_value_start; /*!< Min value since start */
+	mon_type_t	mon_start_value;/*!< Value at the start time */
+	mon_type_t	mon_last_value;	/*!< Last set of values */
+	monitor_running_t mon_status;	/*!< whether monitor still running */
+};
+
+/** The following are the possible values for the "monitor_type" field in
+"struct monitor_info_t"; the values are distinct bits */
+enum monitor_type_t {
+	MONITOR_NONE = 0,	/*!< No monitoring */
+	MONITOR_MODULE = 1,	/*!< This is a monitor module type,
+				not a counter */
+	MONITOR_EXISTING = 2,	/*!< The monitor carries information from
+				an existing system status variable */
+	MONITOR_NO_AVERAGE = 4,	/*!< Set this status if we don't want to
+				calculate the average value for the counter */
+	MONITOR_DISPLAY_CURRENT = 8, /*!< Display current value of the
+				counter, rather than incremental value
+				over the period. Mostly for counters
+				displaying current resource usage */
+	MONITOR_GROUP_MODULE = 16, /*!< Monitor can be turned on/off
+				only as a module, but not individually */
+	MONITOR_DEFAULT_ON = 32,/*!< Monitor will be turned on by default at
+				server start up */
+	MONITOR_SET_OWNER = 64,	/*!< Owner of "monitor set", a set of
+				monitor counters */
+	MONITOR_SET_MEMBER = 128,/*!< Being part of a "monitor set" */
+	MONITOR_HIDDEN = 256	/*!< Do not display this monitor in the
+				metrics table */
+};
+
+/** The minimum of a counter is initialized to the max value of mon_type_t
+ (int64_t), and its maximum to the min value; see MONITOR_INIT */
+#ifndef INT64_MAX
+#define INT64_MAX		(9223372036854775807LL)
+#endif
+#ifndef INT64_MIN
+#define INT64_MIN		(-9223372036854775807LL-1)
+#endif
+#define	MIN_RESERVED		INT64_MAX /* initial value of the min fields */
+#define	MAX_RESERVED		INT64_MIN /* initial value of the max fields */
+
+/** This enumeration defines the internal monitor identifiers that are used
+to identify each particular counter. Its value indexes into two arrays,
+one is the "innodb_counter_value" array which records actual monitor
+counter values, the other is "innodb_counter_info" array which describes
+each counter's basic information (name, desc etc.). A couple of
+naming rules here:
+1) If the monitor defines a module, it starts with MONITOR_MODULE
+2) If the monitor uses existing counters from "status variable", its ID
+name shall start with MONITOR_OVLD
+
+Please refer to "innodb_counter_info" in srv/srv0mon.cc for detailed
+information on each monitor counter */
+
+enum monitor_id_t {
+	/* This is to identify the default value set by the metrics
+	control global variables */
+	MONITOR_DEFAULT_START = 0,
+
+	/* Start of Metadata counter */
+	MONITOR_MODULE_METADATA,
+	MONITOR_TABLE_OPEN,
+
+	/* Lock manager related counters */
+	MONITOR_MODULE_LOCK,
+	MONITOR_DEADLOCK,
+	MONITOR_TIMEOUT,
+	MONITOR_LOCKREC_WAIT,
+	MONITOR_TABLELOCK_WAIT,
+	MONITOR_NUM_RECLOCK_REQ,
+	MONITOR_RECLOCK_CREATED,
+	MONITOR_RECLOCK_REMOVED,
+	MONITOR_NUM_RECLOCK,
+	MONITOR_TABLELOCK_CREATED,
+	MONITOR_TABLELOCK_REMOVED,
+	MONITOR_NUM_TABLELOCK,
+	MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT,
+	MONITOR_OVLD_LOCK_WAIT_TIME,
+	MONITOR_OVLD_LOCK_MAX_WAIT_TIME,
+	MONITOR_OVLD_ROW_LOCK_WAIT,
+	MONITOR_OVLD_LOCK_AVG_WAIT_TIME,
+
+	/* Buffer and I/O related counters. */
+	MONITOR_MODULE_BUFFER,
+	MONITOR_OVLD_BUFFER_POOL_SIZE,
+	MONITOR_OVLD_BUF_POOL_READS,
+	MONITOR_OVLD_BUF_POOL_READ_REQUESTS,
+	MONITOR_OVLD_BUF_POOL_WRITE_REQUEST,
+	MONITOR_OVLD_BUF_POOL_WAIT_FREE,
+	MONITOR_OVLD_BUF_POOL_READ_AHEAD,
+	MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED,
+	MONITOR_OVLD_BUF_POOL_PAGE_TOTAL,
+	MONITOR_OVLD_BUF_POOL_PAGE_MISC,
+	MONITOR_OVLD_BUF_POOL_PAGES_DATA,
+	MONITOR_OVLD_BUF_POOL_BYTES_DATA,
+	MONITOR_OVLD_BUF_POOL_PAGES_DIRTY,
+	MONITOR_OVLD_BUF_POOL_BYTES_DIRTY,
+	MONITOR_OVLD_BUF_POOL_PAGES_FREE,
+	MONITOR_OVLD_PAGE_CREATED,
+	MONITOR_OVLD_PAGES_WRITTEN,
+	MONITOR_OVLD_PAGES_READ,
+	MONITOR_OVLD_BYTE_READ,
+	MONITOR_OVLD_BYTE_WRITTEN,
+	MONITOR_FLUSH_BATCH_SCANNED,
+	MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+	MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
+	MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+	MONITOR_FLUSH_BATCH_COUNT,
+	MONITOR_FLUSH_BATCH_PAGES,
+	MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+	MONITOR_FLUSH_NEIGHBOR_COUNT,
+	MONITOR_FLUSH_NEIGHBOR_PAGES,
+	MONITOR_FLUSH_N_TO_FLUSH_REQUESTED,
+
+	MONITOR_FLUSH_N_TO_FLUSH_BY_AGE,
+	MONITOR_FLUSH_ADAPTIVE_AVG_TIME,
+
+	MONITOR_FLUSH_ADAPTIVE_AVG_PASS,
+
+	MONITOR_LRU_GET_FREE_LOOPS,
+	MONITOR_LRU_GET_FREE_WAITS,
+
+	MONITOR_FLUSH_AVG_PAGE_RATE,
+	MONITOR_FLUSH_LSN_AVG_RATE,
+	MONITOR_FLUSH_PCT_FOR_DIRTY,
+	MONITOR_FLUSH_PCT_FOR_LSN,
+	MONITOR_FLUSH_SYNC_WAITS,
+	MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+	MONITOR_FLUSH_ADAPTIVE_COUNT,
+	MONITOR_FLUSH_ADAPTIVE_PAGES,
+	MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+	MONITOR_FLUSH_SYNC_COUNT,
+	MONITOR_FLUSH_SYNC_PAGES,
+	MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+	MONITOR_FLUSH_BACKGROUND_COUNT,
+	MONITOR_FLUSH_BACKGROUND_PAGES,
+	MONITOR_LRU_BATCH_SCANNED,
+	MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+	MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+	MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
+	MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
+	MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT,
+	MONITOR_LRU_GET_FREE_SEARCH,
+	MONITOR_LRU_SEARCH_SCANNED,
+	MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+	MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
+	MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+	MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+	MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL,
+
+	/* Buffer Page I/O specific counters. */
+	MONITOR_MODULE_BUF_PAGE,
+	MONITOR_INDEX_LEAF_PAGE_READ,
+	MONITOR_INDEX_NON_LEAF_PAGE_READ,
+	MONITOR_INDEX_IBUF_LEAF_PAGE_READ,
+	MONITOR_INDEX_IBUF_NON_LEAF_PAGE_READ,
+	MONITOR_UNDO_LOG_PAGE_READ,
+	MONITOR_INODE_PAGE_READ,
+	MONITOR_IBUF_FREELIST_PAGE_READ,
+	MONITOR_IBUF_BITMAP_PAGE_READ,
+	MONITOR_SYSTEM_PAGE_READ,
+	MONITOR_TRX_SYSTEM_PAGE_READ,
+	MONITOR_FSP_HDR_PAGE_READ,
+	MONITOR_XDES_PAGE_READ,
+	MONITOR_BLOB_PAGE_READ,
+	MONITOR_ZBLOB_PAGE_READ,
+	MONITOR_ZBLOB2_PAGE_READ,
+	MONITOR_OTHER_PAGE_READ,
+	MONITOR_INDEX_LEAF_PAGE_WRITTEN,
+	MONITOR_INDEX_NON_LEAF_PAGE_WRITTEN,
+	MONITOR_INDEX_IBUF_LEAF_PAGE_WRITTEN,
+	MONITOR_INDEX_IBUF_NON_LEAF_PAGE_WRITTEN,
+	MONITOR_UNDO_LOG_PAGE_WRITTEN,
+	MONITOR_INODE_PAGE_WRITTEN,
+	MONITOR_IBUF_FREELIST_PAGE_WRITTEN,
+	MONITOR_IBUF_BITMAP_PAGE_WRITTEN,
+	MONITOR_SYSTEM_PAGE_WRITTEN,
+	MONITOR_TRX_SYSTEM_PAGE_WRITTEN,
+	MONITOR_FSP_HDR_PAGE_WRITTEN,
+	MONITOR_XDES_PAGE_WRITTEN,
+	MONITOR_BLOB_PAGE_WRITTEN,
+	MONITOR_ZBLOB_PAGE_WRITTEN,
+	MONITOR_ZBLOB2_PAGE_WRITTEN,
+	MONITOR_OTHER_PAGE_WRITTEN,
+
+	/* OS level counters (I/O) */
+	MONITOR_MODULE_OS,
+	MONITOR_OVLD_OS_FILE_READ,
+	MONITOR_OVLD_OS_FILE_WRITE,
+	MONITOR_OVLD_OS_FSYNC,
+	MONITOR_OS_PENDING_READS,
+	MONITOR_OS_PENDING_WRITES,
+	MONITOR_OVLD_OS_LOG_WRITTEN,
+
+	/* Transaction related counters */
+	MONITOR_MODULE_TRX,
+	MONITOR_TRX_RW_COMMIT,
+	MONITOR_TRX_RO_COMMIT,
+	MONITOR_TRX_NL_RO_COMMIT,
+	MONITOR_TRX_COMMIT_UNDO,
+	MONITOR_TRX_ROLLBACK,
+	MONITOR_TRX_ROLLBACK_SAVEPOINT,
+	MONITOR_RSEG_HISTORY_LEN,
+	MONITOR_NUM_UNDO_SLOT_USED,
+	MONITOR_NUM_UNDO_SLOT_CACHED,
+	MONITOR_RSEG_CUR_SIZE,
+
+	/* Purge related counters */
+	MONITOR_MODULE_PURGE,
+	MONITOR_N_DEL_ROW_PURGE,
+	MONITOR_N_UPD_EXIST_EXTERN,
+	MONITOR_PURGE_INVOKED,
+	MONITOR_PURGE_N_PAGE_HANDLED,
+	MONITOR_DML_PURGE_DELAY,
+	MONITOR_PURGE_STOP_COUNT,
+	MONITOR_PURGE_RESUME_COUNT,
+
+	/* Recovery related counters */
+	MONITOR_MODULE_RECOVERY,
+	MONITOR_OVLD_CHECKPOINTS,
+	MONITOR_OVLD_LSN_FLUSHDISK,
+	MONITOR_OVLD_LSN_CHECKPOINT,
+	MONITOR_OVLD_LSN_CURRENT,
+	MONITOR_LSN_CHECKPOINT_AGE,
+	MONITOR_OVLD_BUF_OLDEST_LSN,
+	MONITOR_OVLD_MAX_AGE_ASYNC,
+	MONITOR_OVLD_LOG_WAITS,
+	MONITOR_OVLD_LOG_WRITE_REQUEST,
+	MONITOR_OVLD_LOG_WRITES,
+
+	/* Page Manager related counters */
+	MONITOR_MODULE_PAGE,
+	MONITOR_PAGE_COMPRESS,
+	MONITOR_PAGE_DECOMPRESS,
+	MONITOR_PAD_INCREMENTS,
+	MONITOR_PAD_DECREMENTS,
+	/* New monitor variables for page compression */
+	MONITOR_OVLD_PAGE_COMPRESS_SAVED,
+	MONITOR_OVLD_PAGES_PAGE_COMPRESSED,
+	MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP,
+	MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED,
+	MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR,
+
+	/* New monitor variables for page encryption */
+	MONITOR_OVLD_PAGES_ENCRYPTED,
+	MONITOR_OVLD_PAGES_DECRYPTED,
+
+	/* Index related counters */
+	MONITOR_MODULE_INDEX,
+	MONITOR_INDEX_SPLIT,
+	MONITOR_INDEX_MERGE_ATTEMPTS,
+	MONITOR_INDEX_MERGE_SUCCESSFUL,
+	MONITOR_INDEX_REORG_ATTEMPTS,
+	MONITOR_INDEX_REORG_SUCCESSFUL,
+	MONITOR_INDEX_DISCARD,
+
+#ifdef BTR_CUR_HASH_ADAPT
+	/* Adaptive Hash Index related counters */
+	MONITOR_MODULE_ADAPTIVE_HASH,
+	MONITOR_OVLD_ADAPTIVE_HASH_SEARCH,
+	MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE,
+	MONITOR_ADAPTIVE_HASH_PAGE_ADDED,
+	MONITOR_ADAPTIVE_HASH_PAGE_REMOVED,
+	MONITOR_ADAPTIVE_HASH_ROW_ADDED,
+	MONITOR_ADAPTIVE_HASH_ROW_REMOVED,
+	MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND,
+	MONITOR_ADAPTIVE_HASH_ROW_UPDATED,
+#endif /* BTR_CUR_HASH_ADAPT */
+
+	/* Tablespace related counters */
+	MONITOR_MODULE_FIL_SYSTEM,
+	MONITOR_OVLD_N_FILE_OPENED,
+
+	/* InnoDB Change Buffer related counters */
+	MONITOR_MODULE_IBUF_SYSTEM,
+	MONITOR_OVLD_IBUF_MERGE_INSERT,
+	MONITOR_OVLD_IBUF_MERGE_DELETE,
+	MONITOR_OVLD_IBUF_MERGE_PURGE,
+	MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT,
+	MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE,
+	MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE,
+	MONITOR_OVLD_IBUF_MERGES,
+	MONITOR_OVLD_IBUF_SIZE,
+
+	/* Counters for server operations */
+	MONITOR_MODULE_SERVER,
+	MONITOR_MASTER_THREAD_SLEEP,
+	MONITOR_OVLD_SERVER_ACTIVITY,
+	MONITOR_MASTER_ACTIVE_LOOPS,
+	MONITOR_MASTER_IDLE_LOOPS,
+	MONITOR_SRV_LOG_FLUSH_MICROSECOND,
+	MONITOR_SRV_DICT_LRU_MICROSECOND,
+	MONITOR_SRV_DICT_LRU_EVICT_COUNT_ACTIVE,
+	MONITOR_SRV_DICT_LRU_EVICT_COUNT_IDLE,
+	MONITOR_OVLD_SRV_DBLWR_WRITES,
+	MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN,
+	MONITOR_OVLD_SRV_PAGE_SIZE,
+
+	/* Data DDL related counters */
+	MONITOR_MODULE_DDL_STATS,
+	MONITOR_BACKGROUND_DROP_INDEX,
+	MONITOR_ONLINE_CREATE_INDEX,
+	MONITOR_PENDING_ALTER_TABLE,
+	MONITOR_ALTER_TABLE_SORT_FILES,
+	MONITOR_ALTER_TABLE_LOG_FILES,
+
+	MONITOR_MODULE_ICP,
+	MONITOR_ICP_ATTEMPTS,
+	MONITOR_ICP_NO_MATCH,
+	MONITOR_ICP_OUT_OF_RANGE,
+	MONITOR_ICP_MATCH,
+
+	/* This is used only for control system to turn
+	on/off and reset all monitor counters */
+	MONITOR_ALL_COUNTER,
+
+	/* This must be the last member */
+	NUM_MONITOR
+};
+
+/** This informs the monitor control system to turn
+on/off and reset monitor counters through a wildcard match */
+#define	MONITOR_WILDCARD_MATCH		(NUM_MONITOR + 1)
+
+/** Cannot find a monitor counter with the specified name */
+#define	MONITOR_NO_MATCH		(NUM_MONITOR + 2)
+
+/** struct monitor_info_t describes the basic/static information
+about each monitor counter. */
+struct monitor_info_t {
+	const char*	monitor_name;	/*!< Monitor name */
+	const char*	monitor_module;	/*!< Sub Module the monitor
+					belongs to */
+	const char*	monitor_desc;	/*!< Brief desc of monitor counter */
+	monitor_type_t	monitor_type;	/*!< Type of Monitor Info */
+	monitor_id_t	monitor_related_id;/*!< Monitor ID of the counter that
+					is related to this monitor. This is
+					set when the monitor belongs to
+					a "monitor set" */
+	monitor_id_t	monitor_id;	/*!< Monitor ID as defined in enum
+					monitor_id_t */
+};
+
+/** Following are the "set_option" values allowed for the
+srv_mon_process_existing_counter() function,
+to turn on/off/reset the monitor counters. */
+enum mon_option_t {
+	MONITOR_TURN_ON = 1,		/*!< Turn on the counter */
+	MONITOR_TURN_OFF,		/*!< Turn off the counter */
+	MONITOR_RESET_VALUE,		/*!< Reset current values */
+	MONITOR_RESET_ALL_VALUE,	/*!< Reset all values */
+	MONITOR_GET_VALUE		/*!< Option for
+					srv_mon_process_existing_counter()
+					function */
+};
+
+/** Number of bits in a ulint datatype */
+#define	NUM_BITS_ULINT	(sizeof(ulint) * CHAR_BIT)
+
+/** This "monitor_set_tbl" is a bitmap that records whether a particular
+monitor counter has been turned on or off (one bit per monitor_id_t) */
+extern Atomic_relaxed<ulint>
+    monitor_set_tbl[(NUM_MONITOR + NUM_BITS_ULINT - 1) / NUM_BITS_ULINT];
+
+/** Macros to turn on/off the control bit in monitor_set_tbl for a monitor
+counter option, using fetch_or() and fetch_and() on the containing word. */
+#define MONITOR_ON(monitor)                                                   \
+  (monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT].fetch_or(              \
+      (ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT))))
+
+#define MONITOR_OFF(monitor)                                                  \
+  (monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT].fetch_and(             \
+      ~(ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT))))
+
+/** Check whether the requested monitor is turned on (nonzero) or off (0) */
+#define MONITOR_IS_ON(monitor)                                                \
+  (monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT] &                      \
+   (ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT)))
+
+/** The actual monitor counter array that records each monitor counter
+value; it is indexed by monitor_id_t */
+extern monitor_value_t	 innodb_counter_value[NUM_MONITOR];
+
+/** The following are macro definitions for basic monitor counter
+manipulations. Please note that we do not provide any synchronization for
+these monitor operations due to performance considerations. Most counters
+can be placed under existing mutex protections in the respective code
+module. */
+
+/** Macros to access various fields of a monitor counter */
+#define MONITOR_FIELD(monitor, field)			\
+		(innodb_counter_value[monitor].field)
+
+#define MONITOR_VALUE(monitor)				\
+		MONITOR_FIELD(monitor, mon_value)
+
+#define MONITOR_MAX_VALUE(monitor)			\
+		MONITOR_FIELD(monitor, mon_max_value)
+
+#define MONITOR_MIN_VALUE(monitor)			\
+		MONITOR_FIELD(monitor, mon_min_value)
+
+#define MONITOR_VALUE_RESET(monitor)			\
+		MONITOR_FIELD(monitor, mon_value_reset)
+
+#define MONITOR_MAX_VALUE_START(monitor)		\
+		MONITOR_FIELD(monitor, mon_max_value_start)
+
+#define MONITOR_MIN_VALUE_START(monitor)		\
+		MONITOR_FIELD(monitor, mon_min_value_start)
+
+#define MONITOR_LAST_VALUE(monitor)			\
+		MONITOR_FIELD(monitor, mon_last_value)
+
+#define MONITOR_START_VALUE(monitor)			\
+		MONITOR_FIELD(monitor, mon_start_value)
+
+#define MONITOR_VALUE_SINCE_START(monitor)		\
+		(MONITOR_VALUE(monitor) + MONITOR_VALUE_RESET(monitor))
+
+#define MONITOR_STATUS(monitor)				\
+		MONITOR_FIELD(monitor, mon_status)
+
+#define MONITOR_SET_START(monitor) /* mark as started; record the time */ \
+	do {								\
+		MONITOR_STATUS(monitor) = MONITOR_STARTED;		\
+		MONITOR_FIELD((monitor), mon_start_time) = time(NULL);	\
+	} while (0)
+
+#define MONITOR_SET_OFF(monitor) /* mark as stopped; record the time */	\
+	do {								\
+		MONITOR_STATUS(monitor) = MONITOR_STOPPED;		\
+		MONITOR_FIELD((monitor), mon_stop_time) = time(NULL);	\
+	} while (0)
+
+#define	MONITOR_INIT_ZERO_VALUE		0 /* see MONITOR_MAX_MIN_NOT_INIT */
+
+/** Max and min values are initialized when we first turn on the monitor
+counter, and set the MONITOR_STATUS. Until then, all three are still 0. */
+#define MONITOR_MAX_MIN_NOT_INIT(monitor)				\
+		(MONITOR_STATUS(monitor) == MONITOR_INIT_ZERO_VALUE	\
+		 && MONITOR_MIN_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE \
+		 && MONITOR_MAX_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE)
+
+#define MONITOR_INIT(monitor) /* set min/max to MIN/MAX_RESERVED once */ \
+	if (MONITOR_MAX_MIN_NOT_INIT(monitor)) {			\
+		MONITOR_MIN_VALUE(monitor) = MIN_RESERVED;		\
+		MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED;	\
+		MONITOR_MAX_VALUE(monitor) = MAX_RESERVED;		\
+		MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED;	\
+	}
+
+/** Macros to increment/decrement the counters. The normal
+monitor counter operation expects that appropriate synchronization
+already exists. No additional mutex is necessary when operating
+on the counters. MONITOR_INC also raises the max value when exceeded. */
+#define	MONITOR_INC(monitor)						\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor)++;				\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Atomically increment a monitor counter.
+Use MONITOR_INC if appropriate mutex protection exists.
+@param monitor	monitor to be incremented by 1
+@param enabled	whether the monitor is enabled */
+#define MONITOR_ATOMIC_INC_LOW(monitor, enabled)			\
+	if (enabled) {							\
+		mon_type_t value = my_atomic_add64_explicit(		\
+			(int64*) &MONITOR_VALUE(monitor), 1,		\
+			MY_MEMORY_ORDER_RELAXED) + 1;			\
+		/* Compare as signed mon_type_t, like MONITOR_INC: the	\
+		initial maximum MAX_RESERVED is negative. The update	\
+		below is racy; we ignore that due to performance. */	\
+		if (value > MONITOR_MAX_VALUE(monitor)) {		\
+			MONITOR_MAX_VALUE(monitor) = value;		\
+		}							\
+	}
+
+/** Atomically decrement a monitor counter.
+Use MONITOR_DEC if appropriate mutex protection exists.
+@param monitor	monitor to be decremented by 1
+@param enabled	whether the monitor is enabled */
+#define MONITOR_ATOMIC_DEC_LOW(monitor, enabled)			\
+	if (enabled) {							\
+		mon_type_t value = my_atomic_add64_explicit(		\
+			(int64*) &MONITOR_VALUE(monitor), -1,		\
+			MY_MEMORY_ORDER_RELAXED) - 1;			\
+		/* Compare as signed mon_type_t, like MONITOR_DEC, so	\
+		that a negative value can lower the minimum. The update	\
+		below is racy; we ignore that due to performance. */	\
+		if (value < MONITOR_MIN_VALUE(monitor)) {		\
+			MONITOR_MIN_VALUE(monitor) = value;		\
+		}							\
+	}
+
+/** Atomically increment a monitor counter if it is enabled.
+Use MONITOR_INC if appropriate mutex protection exists.
+@param monitor	monitor to be incremented by 1 */
+#define MONITOR_ATOMIC_INC(monitor)				\
+	MONITOR_ATOMIC_INC_LOW(monitor, MONITOR_IS_ON(monitor))
+/** Atomically decrement a monitor counter if it is enabled.
+Use MONITOR_DEC if appropriate mutex protection exists.
+@param monitor	monitor to be decremented by 1 */
+#define MONITOR_ATOMIC_DEC(monitor)				\
+	MONITOR_ATOMIC_DEC_LOW(monitor, MONITOR_IS_ON(monitor))
+
+#define	MONITOR_DEC(monitor) /* not atomic; also tracks the minimum */	\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor)--;				\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+#ifdef HAVE_MEM_CHECK /* pass a copy of value to MEM_CHECK_DEFINED() */
+# define MONITOR_CHECK_DEFINED(value) do {	\
+    mon_type_t m __attribute__((unused))= value;        \
+	MEM_CHECK_DEFINED(&m, sizeof m);	\
+} while (0)
+#else /* HAVE_MEM_CHECK */
+# define MONITOR_CHECK_DEFINED(value) (void) 0 /* no-op */
+#endif /* HAVE_MEM_CHECK */
+
+#define	MONITOR_INC_VALUE(monitor, value) /* add value; track the max */ \
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Subtract "value" from a monitor counter that is on; track the minimum */
+#define	MONITOR_DEC_VALUE(monitor, value)				\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		ut_ad(MONITOR_VALUE(monitor) >= (mon_type_t) (value));	\
+		MONITOR_VALUE(monitor) -= (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor))    \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+	}
+
+/* Increment/decrement a counter without checking the monitor on/off bit,
+which could already have been checked as a module group */
+#define	MONITOR_INC_NOCHECK(monitor)					\
+	do {								\
+		MONITOR_VALUE(monitor)++;				\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	} while (0)
+
+#define	MONITOR_DEC_NOCHECK(monitor) /* no MONITOR_IS_ON check */	\
+	do {								\
+		MONITOR_VALUE(monitor)--;				\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	} while (0)
+
+/** Directly set a monitor counter's value, updating its max and min */
+#define	MONITOR_SET(monitor, value)					\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Add the time elapsed since "value", as measured by
+microsecond_interval_timer(), to the monitor counter if it is on
+@param monitor monitor to update for the time difference
+@param value the start time; overwritten with the current timer value */
+#define	MONITOR_INC_TIME_IN_MICRO_SECS(monitor, value)			\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		uintmax_t	old_time = value;			\
+		value = microsecond_interval_timer();			\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value - old_time);\
+	}
+
+/** Update three counters in one call. Only the main counter 'monitor' is
+checked for being on; the other two are updated whenever it is.
+NOTE: 'value' is expanded several times, so it must have no side effects.
+@param monitor the main monitor counter; accumulates the sum of all
+			values (its maximum is tracked as well)
+@param monitor_n_calls counter incremented once per invocation of this
+			macro
+@param monitor_per_call counter that holds the most recent value and
+			the largest value seen
+@param value incremental value to record this time */
+#define MONITOR_INC_VALUE_CUMULATIVE(					\
+		monitor, monitor_n_calls, monitor_per_call, value)	\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor_n_calls)++;			\
+		MONITOR_VALUE(monitor_per_call) = (mon_type_t) (value);	\
+		if (MONITOR_VALUE(monitor_per_call)			\
+		    > MONITOR_MAX_VALUE(monitor_per_call)) {		\
+			MONITOR_MAX_VALUE(monitor_per_call) =		\
+				 (mon_type_t) (value);			\
+		}							\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Directly set a monitor counter's value, updating only the recorded maximum;
+meant for monotonically increasing values, whose minimum needs no tracking */
+#define	MONITOR_SET_UPD_MAX_ONLY(monitor, value)			\
+	MONITOR_CHECK_DEFINED(value); /* 2 statements: use in a braced block */ \
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Some values, such as the log sequence number, are monotonically increasing
+numbers; set the counter for them without recording max/min values */
+#define MONITOR_SET_SIMPLE(monitor, value)				\
+	MONITOR_CHECK_DEFINED(value); /* 2 statements: use in a braced block */ \
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);		\
+	}
+
+/** Reset a counter: values and timestamps to MONITOR_INIT_ZERO_VALUE, max/min
+to MAX_RESERVED/MIN_RESERVED. No on/off check here; callers must do that */
+#define MONITOR_RESET_ALL(monitor)					\
+	do {								\
+		MONITOR_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_MAX_VALUE(monitor) = MAX_RESERVED;		\
+		MONITOR_MIN_VALUE(monitor) = MIN_RESERVED;		\
+		MONITOR_VALUE_RESET(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED;	\
+		MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED;	\
+		MONITOR_LAST_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_start_time) =		\
+					MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_stop_time) =			\
+					MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_reset_time) =		\
+					MONITOR_INIT_ZERO_VALUE;	\
+	} while (0)
+
+/** The following macros define the operations needed to fetch and
+consolidate information from existing system status variables. */
+
+/** Save the passed-in value, less the reset baseline
+MONITOR_VALUE_RESET(monitor), as the start value of the monitor counter */
+#define MONITOR_SAVE_START(monitor, value) do {				\
+	MONITOR_CHECK_DEFINED(value);					\
+	(MONITOR_START_VALUE(monitor) =					\
+		(mon_type_t) (value) - MONITOR_VALUE_RESET(monitor));	\
+	} while (0)
+
+/** Remember the counter's current value as its last value, and add that
+same value to the counter's start value (no value is passed in) */
+#define MONITOR_SAVE_LAST(monitor)					\
+	do {								\
+		MONITOR_LAST_VALUE(monitor) = MONITOR_VALUE(monitor);	\
+		MONITOR_START_VALUE(monitor) += MONITOR_VALUE(monitor);	\
+	} while (0)
+
+/** Set the monitor value (through MONITOR_SET_UPD_MAX_ONLY) to value minus the
+reset baseline and mon_start_value, compensated by adding mon_last_value */
+#define MONITOR_SET_DIFF(monitor, value)				\
+	MONITOR_SET_UPD_MAX_ONLY(monitor, ((value)			\
+	- MONITOR_VALUE_RESET(monitor)					\
+	- MONITOR_FIELD(monitor, mon_start_value)			\
+	+ MONITOR_FIELD(monitor, mon_last_value)))
+
+/****************************************************************//**
+Get a monitor's monitor_info_t by its monitor id (index into the
+innodb_counter_info array)
+@return pointer to the corresponding monitor_info_t, or NULL if there is
+no such monitor */
+monitor_info_t*
+srv_mon_get_info(
+/*=============*/
+	monitor_id_t	monitor_id);	/*!< id index into the
+					innodb_counter_info array */
+/****************************************************************//**
+Get a monitor's name by its monitor id (index into the
+innodb_counter_info array)
+@return the corresponding monitor name, or NULL if there is no such
+monitor */
+const char*
+srv_mon_get_name(
+/*=============*/
+	monitor_id_t	monitor_id);	/*!< id index into the
+					innodb_counter_info array */
+
+/****************************************************************//**
+Turn on/off/reset monitor counters in a module. If module_id
+is NUM_MONITOR then turn on all monitor counters.
+This function is declared void: it does not return a status or the id
+of a monitor that could not be turned on. */
+void
+srv_mon_set_module_control(
+/*=======================*/
+	monitor_id_t	module_id,	/*!< in: Module ID as in
+					monitor_counter_id. If it is
+					set to NUM_MONITOR, this means
+					we shall turn on all the counters */
+	mon_option_t	set_option);	/*!< in: Turn on/off reset the
+					counter */
+/****************************************************************//**
+This function consolidates some existing server counters used
+by "system status variables". These existing system variables do not have
+mechanism to start/stop and reset the counters, so we simulate these
+controls by remembering the corresponding counter values when the
+corresponding monitors are turned on/off/reset, and do the appropriate
+arithmetic to deduce the actual value. */
+void
+srv_mon_process_existing_counter(
+/*=============================*/
+	monitor_id_t	monitor_id,	/*!< in: the monitor's ID as in
+					monitor_counter_id */
+	mon_option_t	set_option);	/*!< in: Turn on/off reset the
+					counter */
+/*************************************************************//**
+This function is used to calculate the maximum counter value
+since the start of monitor counter
+@return max counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+	monitor_id_t	monitor);	/*!< in: monitor id */
+/*************************************************************//**
+This function is used to calculate the minimum counter value
+since the start of monitor counter
+@return min counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_min_since_start(
+/*=========================*/
+	monitor_id_t	monitor);	/*!< in: monitor id*/
+/*************************************************************//**
+Reset a monitor, create a new base line with the current monitor
+value. This baseline is recorded by MONITOR_VALUE_RESET(monitor) */
+void
+srv_mon_reset(
+/*==========*/
+	monitor_id_t	monitor);	/*!< in: monitor id*/
+/*************************************************************//**
+This function resets all values of a monitor counter */
+UNIV_INLINE
+void
+srv_mon_reset_all(
+/*==============*/
+	monitor_id_t	monitor);	/*!< in: monitor id*/
+/*************************************************************//**
+Turn on monitor counters that are marked as default ON. */
+void
+srv_mon_default_on(void);
+/*====================*/
+
+#include "srv0mon.inl"
+
+#endif
diff --git a/storage/innobase/include/srv0mon.inl b/storage/innobase/include/srv0mon.inl
new file mode 100644
index 00000000..158345b2
--- /dev/null
+++ b/storage/innobase/include/srv0mon.inl
@@ -0,0 +1,113 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/srv0mon.inl
+Server monitoring system
+
+Created 1/20/2010	Jimmy Yang
+************************************************************************/
+
+/*************************************************************//**
+Compute the largest value a monitor counter has reached since the
+start of the monitor counter, caching it in MONITOR_MAX_VALUE_START.
+@return max counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+	monitor_id_t	monitor)	/*!< in: monitor id */
+{
+	if (MONITOR_MAX_VALUE_START(monitor) == MAX_RESERVED) {
+		/* Nothing has been cached yet: adopt the maximum
+		recorded in MONITOR_MAX_VALUE as it stands. */
+		MONITOR_MAX_VALUE_START(monitor) =
+				MONITOR_MAX_VALUE(monitor);
+
+		return(MONITOR_MAX_VALUE_START(monitor));
+	}
+
+	if (MONITOR_MAX_VALUE(monitor) != MAX_RESERVED) {
+		/* MONITOR_MAX_VALUE is the max value since reset, so
+		add the reset value back before comparing it with
+		the cached maximum. */
+		const mon_type_t	max_with_reset =
+				MONITOR_MAX_VALUE(monitor)
+				+ MONITOR_VALUE_RESET(monitor);
+
+		if (max_with_reset > MONITOR_MAX_VALUE_START(monitor)) {
+			MONITOR_MAX_VALUE_START(monitor) = max_with_reset;
+		}
+	}
+
+	return(MONITOR_MAX_VALUE_START(monitor));
+}
+
+/*************************************************************//**
+Compute the smallest value a monitor counter has reached since the
+start of the monitor counter, caching it in MONITOR_MIN_VALUE_START.
+@return min counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_min_since_start(
+/*=========================*/
+	monitor_id_t	monitor)	/*!< in: monitor id */
+{
+	if (MONITOR_MIN_VALUE_START(monitor) == MIN_RESERVED) {
+		/* Nothing has been cached yet: adopt the minimum
+		recorded in MONITOR_MIN_VALUE as it stands. */
+		MONITOR_MIN_VALUE_START(monitor) =
+				MONITOR_MIN_VALUE(monitor);
+
+		return(MONITOR_MIN_VALUE_START(monitor));
+	}
+
+	if (MONITOR_MIN_VALUE(monitor) != MIN_RESERVED) {
+		/* MONITOR_MIN_VALUE is the min value since reset, so
+		add the reset value back before comparing it with
+		the cached minimum. */
+		const mon_type_t	min_with_reset =
+				MONITOR_MIN_VALUE(monitor)
+				+ MONITOR_VALUE_RESET(monitor);
+
+		if (min_with_reset < MONITOR_MIN_VALUE_START(monitor)) {
+			MONITOR_MIN_VALUE_START(monitor) = min_with_reset;
+		}
+	}
+
+	return(MONITOR_MIN_VALUE_START(monitor));
+}
+
+/*************************************************************//**
+Reset every value of a monitor counter, unless the monitor is on */
+UNIV_INLINE
+void
+srv_mon_reset_all(
+/*==============*/
+	monitor_id_t	monitor)	/*!< in: monitor id */
+{
+	if (MONITOR_IS_ON(monitor)) {
+		/* A counter that is still on is left untouched. */
+		fprintf(stderr, "InnoDB: Cannot reset all values for"
+			" monitor counter %s while it is on. Please"
+			" turn it off and retry.\n",
+			srv_mon_get_name(monitor));
+		return;
+	}
+	MONITOR_RESET_ALL(monitor);
+}
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
new file mode 100644
index 00000000..db846795
--- /dev/null
+++ b/storage/innobase/include/srv0srv.h
@@ -0,0 +1,715 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2008, 2009, Google Inc.
+Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2023, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0srv.h
+The server main program
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "log0log.h"
+#include "que0types.h"
+#include "trx0types.h"
+#include "fil0fil.h"
+#include "ut0counter.h"
+
+#include "mysql/psi/mysql_stage.h"
+#include "mysql/psi/psi.h"
+#include <tpool.h>
+#include <memory>
+
+/** A plain counter with no atomicity, aligned to CPU_LEVEL1_DCACHE_LINESIZE
+@tparam	Type  the integer type of the counter */
+template <typename Type>
+struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) simple_counter
+{
+  /** Add one to the counter; @return the new value */
+  Type inc() { return add(Type(1)); }
+  /** Subtract one by adding ~0 converted to Type; @return the new value */
+  Type dec() { return add(static_cast<Type>(~0)); }
+
+  /** Add to the counter
+  @param i  amount to be added
+  @return the value of the counter after adding */
+  Type add(Type i) { m_counter += i; return m_counter; }
+
+  /** @return the current value of the counter */
+  operator Type() const { return m_counter; }
+
+private:
+  /** The counter value */
+  Type m_counter;
+};
+
+/** Global counters used inside InnoDB. */
+struct srv_stats_t
+{
+	typedef ib_counter_t<ulint> ulint_ctr_n_t;
+	typedef simple_counter<lsn_t> lsn_ctr_1_t;
+	typedef simple_counter<ulint> ulint_ctr_1_t;
+	typedef simple_counter<int64_t> int64_ctr_1_t;
+
+	/** Amount of data written in total (in bytes) */
+	ulint_ctr_1_t		data_written;
+	/** Number of bytes saved by page compression */
+	ulint_ctr_n_t          page_compression_saved;
+	/** Number of pages compressed with page compression */
+        ulint_ctr_n_t          pages_page_compressed;
+	/** Number of TRIM operations induced by page compression */
+        ulint_ctr_n_t          page_compressed_trim_op;
+	/** Number of pages decompressed with page compression */
+        ulint_ctr_n_t          pages_page_decompressed;
+	/** Number of page compression errors */
+	ulint_ctr_n_t          pages_page_compression_error;
+	/** Number of pages encrypted */
+	ulint_ctr_n_t          pages_encrypted;
+	/** Number of pages decrypted */
+	ulint_ctr_n_t          pages_decrypted;
+	/** Number of merge blocks encrypted */
+	ulint_ctr_n_t          n_merge_blocks_encrypted;
+	/** Number of merge blocks decrypted */
+	ulint_ctr_n_t          n_merge_blocks_decrypted;
+	/** Number of row log blocks encrypted */
+	ulint_ctr_n_t          n_rowlog_blocks_encrypted;
+	/** Number of row log blocks decrypted */
+	ulint_ctr_n_t          n_rowlog_blocks_decrypted;
+
+	/** Amount of data read in total (in bytes) */
+	ulint_ctr_1_t		data_read;
+
+	/** Number of encryption_get_latest_key_version calls */
+	ulint_ctr_n_t		n_key_requests;
+
+	/** Number of temporary tablespace blocks encrypted */
+	ulint_ctr_n_t		n_temp_blocks_encrypted;
+
+	/** Number of temporary tablespace blocks decrypted */
+	ulint_ctr_n_t		n_temp_blocks_decrypted;
+};
+
+/** We are prepared for a situation that we have this many threads waiting for
+a transactional lock inside InnoDB. srv_start() sets the value. */
+extern ulint srv_max_n_threads;
+
+extern const char*	srv_main_thread_op_info;
+
+/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
+extern const char	srv_mysql50_table_name_prefix[10];
+
+/** The buffer pool dump/load file name */
+#define SRV_BUF_DUMP_FILENAME_DEFAULT	"ib_buffer_pool"
+extern char*		srv_buf_dump_filename;
+
+/** Boolean config knobs that tell InnoDB to dump the buffer pool at shutdown
+and/or load it during startup. */
+extern char		srv_buffer_pool_dump_at_shutdown;
+extern char		srv_buffer_pool_load_at_startup;
+
+/* Whether to disable file system cache if it is defined */
+extern char		srv_disable_sort_file_cache;
+
+/* If the last data file is auto-extended, we add this many pages to it
+at a time */
+#define SRV_AUTO_EXTEND_INCREMENT (srv_sys_space.get_autoextend_increment())
+
+/** Mutex protecting page_zip_stat_per_index */
+extern mysql_mutex_t page_zip_stat_per_index_mutex;
+/** Mutex for locking srv_monitor_file */
+extern mysql_mutex_t srv_monitor_file_mutex;
+/* Temporary file for innodb monitor output */
+extern FILE*	srv_monitor_file;
+/** Mutex for locking srv_misc_tmpfile */
+extern mysql_mutex_t srv_misc_tmpfile_mutex;
+/* Temporary file for miscellaneous diagnostic output */
+extern FILE*	srv_misc_tmpfile;
+
+/* Server parameters which are read from the initfile */
+
+extern char*	srv_data_home;
+
+/** Set if InnoDB must operate in read-only mode. We don't do any
+recovery and open all tables in RO mode instead of RW mode. We don't
+sync the max trx id to disk either. */
+extern my_bool	srv_read_only_mode;
+/** Set if InnoDB operates in read-only mode or innodb-force-recovery
+is greater than SRV_FORCE_NO_IBUF_MERGE. */
+extern my_bool	high_level_read_only;
+/** store each table created by a user in its own file; data
+dictionary tables are in the system tablespace 0 */
+extern my_bool	srv_file_per_table;
+
+/** Sort buffer size in index creation */
+extern ulong	srv_sort_buf_size;
+/** Maximum modification log file size for online index creation */
+extern unsigned long long	srv_online_max_size;
+
+/* If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in), otherwise we will
+use simulated aio.
+Currently we support native aio on windows and linux */
+extern my_bool	srv_use_native_aio;
+extern my_bool	srv_numa_interleave;
+
+/* Use atomic writes i.e disable doublewrite buffer */
+extern my_bool srv_use_atomic_writes;
+
+/* Compression algorithm*/
+extern ulong innodb_compression_algorithm;
+
+/** TRUE if the server was successfully started */
+extern bool	srv_was_started;
+
+/** Server undo tablespaces directory, can be absolute path. */
+extern char*	srv_undo_dir;
+
+/** Number of undo tablespaces to use. */
+extern uint	srv_undo_tablespaces;
+
+/** The number of UNDO tablespaces that are active (hosting some rollback
+segment). It is quite possible that some of the tablespaces do not host
+any rollback segment, depending on the configuration used. */
+extern uint32_t srv_undo_tablespaces_active;
+
+/** Maximum size of undo tablespace. */
+extern unsigned long long	srv_max_undo_log_size;
+
+extern uint	srv_n_fil_crypt_threads;
+extern uint	srv_n_fil_crypt_threads_started;
+
+/** Rate at which UNDO records should be purged. */
+extern ulong	srv_purge_rseg_truncate_frequency;
+
+/** Enable or Disable Truncate of UNDO tablespace. */
+extern my_bool	srv_undo_log_truncate;
+
+/** Default size of UNDO tablespace (10MiB for innodb_page_size=16k) */
+constexpr ulint SRV_UNDO_TABLESPACE_SIZE_IN_PAGES= (10U << 20) /
+  UNIV_PAGE_SIZE_DEF;
+
+extern char*	srv_log_group_home_dir;
+
+/** The InnoDB redo log file size, or 0 when changing the redo log format
+at startup (while disallowing writes to the redo log). */
+extern ulonglong	srv_log_file_size;
+extern ulong	srv_flush_log_at_trx_commit;
+extern uint	srv_flush_log_at_timeout;
+extern my_bool	srv_adaptive_flushing;
+extern my_bool	srv_flush_sync;
+
+/** Requested size in bytes */
+extern ulint		srv_buf_pool_size;
+/** Requested buffer pool chunk size */
+extern size_t		srv_buf_pool_chunk_unit;
+/** Scan depth for LRU flush batch i.e.: number of blocks scanned*/
+extern ulong	srv_LRU_scan_depth;
+/** Whether or not to flush neighbors of a block */
+extern ulong	srv_flush_neighbors;
+/** Previously requested size */
+extern ulint	srv_buf_pool_old_size;
+/** Current size as scaling factor for the other components */
+extern ulint	srv_buf_pool_base_size;
+/** Current size in bytes */
+extern ulint	srv_buf_pool_curr_size;
+/** Dump this % of each buffer pool during BP dump */
+extern ulong	srv_buf_pool_dump_pct;
+#ifdef UNIV_DEBUG
+/** Abort load after this amount of pages */
+extern ulong srv_buf_pool_load_pages_abort;
+#endif
+/** Lock table size in bytes */
+extern ulint	srv_lock_table_size;
+
+/** the value of innodb_checksum_algorithm */
+extern ulong	srv_checksum_algorithm;
+extern my_bool	srv_random_read_ahead;
+extern ulong	srv_read_ahead_threshold;
+extern uint	srv_n_read_io_threads;
+extern uint	srv_n_write_io_threads;
+
+/* Defragmentation. Originally the Facebook default value was 100, but it is too high */
+#define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40
+extern my_bool	srv_defragment;
+extern uint	srv_defragment_n_pages;
+extern uint	srv_defragment_stats_accuracy;
+extern uint	srv_defragment_fill_factor_n_recs;
+extern double	srv_defragment_fill_factor;
+extern uint	srv_defragment_frequency;
+extern ulonglong	srv_defragment_interval;
+
+extern uint	srv_change_buffer_max_size;
+
+/* Number of IO operations per second the server can do */
+extern ulong    srv_io_capacity;
+
+/* We use this dummy default value at startup for max_io_capacity.
+The real value is set based on the value of io_capacity. */
+#define SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT	(~0UL)
+#define SRV_MAX_IO_CAPACITY_LIMIT		(~0UL)
+extern ulong    srv_max_io_capacity;
+
+/* The "innodb_stats_method" setting, decides how InnoDB is going
+to treat NULL value when collecting statistics. It is not defined
+as enum type because the configure option takes unsigned integer type. */
+extern ulong	srv_innodb_stats_method;
+
+extern ulint	srv_max_n_open_files;
+
+extern double	srv_max_buf_pool_modified_pct;
+extern double	srv_max_dirty_pages_pct_lwm;
+
+extern double	srv_adaptive_flushing_lwm;
+extern ulong	srv_flushing_avg_loops;
+
+extern ulong	srv_force_recovery;
+
+/** innodb_fast_shutdown=1 skips purge and change buffer merge.
+innodb_fast_shutdown=2 effectively crashes the server (no log checkpoint).
+innodb_fast_shutdown=3 is a clean shutdown that skips the rollback
+of active transaction (to be done on restart). */
+extern uint	srv_fast_shutdown;
+
+extern ibool	srv_innodb_status;
+
+extern unsigned long long	srv_stats_transient_sample_pages;
+extern my_bool			srv_stats_persistent;
+extern unsigned long long	srv_stats_persistent_sample_pages;
+extern my_bool			srv_stats_auto_recalc;
+extern my_bool			srv_stats_include_delete_marked;
+extern unsigned long long	srv_stats_modified_counter;
+extern my_bool			srv_stats_sample_traditional;
+
+extern my_bool	srv_use_doublewrite_buf;
+extern ulong	srv_checksum_algorithm;
+
+extern my_bool	srv_force_primary_key;
+
+extern ulong	srv_max_purge_lag;
+extern ulong	srv_max_purge_lag_delay;
+
+extern my_bool	innodb_encrypt_temporary_tables;
+
+extern my_bool  srv_immediate_scrub_data_uncompressed;
+/*-------------------------------------------*/
+
+/** Modes of operation (enumerators take consecutive values from 0) */
+enum srv_operation_mode {
+	/** Normal mode (MariaDB Server) */
+	SRV_OPERATION_NORMAL,
+	/** Mariabackup is executing the server to export tablespaces
+	that have already been restored */
+	SRV_OPERATION_EXPORT_RESTORED,
+	/** Mariabackup is taking a backup */
+	SRV_OPERATION_BACKUP,
+	/** Mariabackup is restoring a backup for subsequent --copy-back */
+	SRV_OPERATION_RESTORE,
+	/** Mariabackup is restoring the incremental part of a backup */
+	SRV_OPERATION_RESTORE_DELTA,
+	/** Mariabackup is restoring a backup for subsequent --export */
+	SRV_OPERATION_RESTORE_EXPORT,
+	/** Mariabackup is taking a backup and avoids deferring
+	any tablespace */
+	SRV_OPERATION_BACKUP_NO_DEFER
+};
+
+/** Current mode of operation */
+extern enum srv_operation_mode srv_operation;
+
+/** whether this is the server's first start after mariabackup --prepare */
+extern bool srv_start_after_restore;
+
+extern my_bool	srv_print_innodb_monitor;
+extern my_bool	srv_print_innodb_lock_monitor;
+extern ibool	srv_print_verbose_log;
+
+extern bool	srv_monitor_active;
+
+
+extern ulong	srv_n_spin_wait_rounds;
+extern uint	srv_spin_wait_delay;
+
+/** Number of initialized rollback segments for persistent undo log */
+extern ulong	srv_available_undo_logs;
+/** Iterations of the loop bounded by 'srv_active' label. */
+extern ulint	srv_main_active_loops;
+/** Iterations of the loop bounded by the 'srv_idle' label. */
+extern ulint	srv_main_idle_loops;
+/** Log writes involving flush. */
+extern ulint	srv_log_writes_and_flush;
+
+#ifdef UNIV_DEBUG
+extern my_bool	innodb_evict_tables_on_commit_debug;
+extern my_bool	srv_purge_view_update_only_debug;
+
+/** InnoDB system tablespace to set during recovery */
+extern uint	srv_sys_space_size_debug;
+/** whether redo log file has been created at startup */
+extern bool	srv_log_file_created;
+#endif /* UNIV_DEBUG */
+
+extern ulint	srv_dml_needed_delay;
+
+/** innodb_purge_threads; the number of purge tasks to use */
+extern uint srv_n_purge_threads;
+
+/* the number of pages to purge in one batch */
+extern ulong srv_purge_batch_size;
+
+/* print all user-level transactions deadlocks to mysqld stderr */
+extern my_bool srv_print_all_deadlocks;
+
+extern my_bool	srv_cmp_per_index_enabled;
+
+/** innodb_encrypt_log */
+extern my_bool	srv_encrypt_log;
+
+/* is encryption enabled */
+extern ulong	srv_encrypt_tables;
+
+
+/** Status variables to be passed to MySQL */
+extern struct export_var_t export_vars;
+
+/** Global counters */
+extern srv_stats_t	srv_stats;
+
+/** Fatal semaphore wait threshold = maximum number of seconds
+that semaphore times out in InnoDB */
+#define DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT 600
+extern ulong	srv_fatal_semaphore_wait_threshold;
+
+/** Buffer pool dump status frequency in percentages */
+extern ulong srv_buf_dump_status_frequency;
+
+# ifdef UNIV_PFS_THREAD
+extern mysql_pfs_key_t	page_cleaner_thread_key;
+extern mysql_pfs_key_t	trx_rollback_clean_thread_key;
+extern mysql_pfs_key_t	thread_pool_thread_key;
+
+/* This macro registers the current thread and its key with the performance
+schema (it expands to nothing when UNIV_PFS_THREAD is not defined) */
+#  define pfs_register_thread(key)			\
+do {							\
+	struct PSI_thread* psi __attribute__((unused))	\
+		= PSI_CALL_new_thread(key, NULL, 0);	\
+	PSI_CALL_set_thread_os_id(psi);			\
+	PSI_CALL_set_thread(psi);			\
+} while (0)
+
+/* This macro delists the current thread from the performance schema */
+#  define pfs_delete_thread()				\
+do {								\
+	PSI_CALL_delete_current_thread();		\
+} while (0)
+# else
+#  define pfs_register_thread(key)
+#  define pfs_delete_thread()
+# endif /* UNIV_PFS_THREAD */
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in ha_innobase::commit_inplace_alter_table(). */
+extern PSI_stage_info	srv_stage_alter_table_end;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+row_merge_insert_index_tuples(). */
+extern PSI_stage_info	srv_stage_alter_table_insert;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+row_log_apply(). */
+extern PSI_stage_info	srv_stage_alter_table_log_index;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+row_log_table_apply(). */
+extern PSI_stage_info	srv_stage_alter_table_log_table;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+row_merge_sort(). */
+extern PSI_stage_info	srv_stage_alter_table_merge_sort;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+row_merge_read_clustered_index(). */
+extern PSI_stage_info	srv_stage_alter_table_read_pk_internal_sort;
+
+/** Performance schema stage event for monitoring buffer pool load progress. */
+extern PSI_stage_info	srv_stage_buffer_pool_load;
+#endif /* HAVE_PSI_STAGE_INTERFACE */
+
+/** Alternatives for srv_force_recovery. Non-zero values are intended
+to help the user get a damaged database up so that they can dump intact
+tables and rows with SELECT INTO OUTFILE. The database must not otherwise
+be used with these options! A bigger number below means that all precautions
+of lower numbers are included. */
+enum {
+	SRV_FORCE_IGNORE_CORRUPT = 1,	/*!< let the server run even if it
+					detects a corrupt page */
+	SRV_FORCE_NO_BACKGROUND	= 2,	/*!< prevent the main thread from
+					running: if a crash would occur
+					in purge, this prevents it */
+	SRV_FORCE_NO_TRX_UNDO = 3,	/*!< do not run DML rollback after
+					recovery */
+	SRV_FORCE_NO_DDL_UNDO = 4,	/*!< also prevent DDL rollback */
+	SRV_FORCE_NO_UNDO_LOG_SCAN = 5,	/*!< do not look at undo logs when
+					starting the database: InnoDB will
+					treat even incomplete transactions
+					as committed */
+	SRV_FORCE_NO_LOG_REDO = 6	/*!< do not do the log roll-forward
+					in connection with recovery */
+};
+
+/* Alternatives for srv_innodb_stats_method, which can be changed by
+setting innodb_stats_method */
+enum srv_stats_method_name_enum {
+	SRV_STATS_NULLS_EQUAL,		/* All NULL values are treated as
+					equal. This is the default setting
+					for innodb_stats_method */
+	SRV_STATS_NULLS_UNEQUAL,	/* All NULL values are treated as
+					NOT equal to each other. */
+	SRV_STATS_NULLS_IGNORED		/* NULL values are ignored */
+};
+
+typedef enum srv_stats_method_name_enum		srv_stats_method_name_t;
+
+/*********************************************************************//**
+Boots Innobase server. */
+void
+srv_boot(void);
+/*==========*/
+/*********************************************************************//**
+Frees the data structures created in srv_init(). */
+void
+srv_free(void);
+
+/******************************************************************//**
+Outputs to a file the output of the InnoDB Monitor.
+@return FALSE if not all information printed
+due to failure to obtain necessary mutex */
+ibool
+srv_printf_innodb_monitor(
+/*======================*/
+	FILE*	file,		/*!< in: output stream */
+	ibool	nowait,		/*!< in: whether to wait for lock_sys.latch */
+	ulint*	trx_start,	/*!< out: file position of the start of
+				the list of active transactions */
+	ulint*	trx_end);	/*!< out: file position of the end of
+				the list of active transactions */
+
+/******************************************************************//**
+Function to pass InnoDB status variables to MySQL */
+void
+srv_export_innodb_status(void);
+/*==========================*/
+/*******************************************************************//**
+Get current server activity count.
+@return activity count. */
+ulint
+srv_get_activity_count(void);
+/*========================*/
+
+/******************************************************************//**
+Increment the server activity counter. */
+void
+srv_inc_activity_count(void);
+/*=========================*/
+
+/**********************************************************************//**
+Enqueues a task to server task queue and releases a worker thread, if there
+is a suspended one. */
+void
+srv_que_task_enqueue_low(
+/*=====================*/
+	que_thr_t*	thr);	/*!< in: query thread */
+
+#ifdef UNIV_DEBUG
+/** @return whether purge or master task is active */
+bool srv_any_background_activity();
+#endif
+
+extern "C" {
+
+
+/** Periodic task which prints the info output by various InnoDB monitors.*/
+void srv_monitor_task(void*);
+
+
+/** The periodic master task controlling the server. */
+void srv_master_callback(void*);
+
+
+/**
+Complete the shutdown tasks such as background DROP TABLE,
+and optionally change buffer merge (on innodb_fast_shutdown=0). */
+void srv_shutdown(bool ibuf_merge);
+
+} /* extern "C" */
+
+#ifdef UNIV_DEBUG
+/** @return number of tasks in queue */
+ulint srv_get_task_queue_length();
+#endif
+
+/** Shut down the purge threads. */
+void srv_purge_shutdown();
+
+/** Initialize the purge tasks. */
+void srv_init_purge_tasks();
+
+/** Status variables to be passed to MySQL */
+struct export_var_t{
+#ifdef BTR_CUR_HASH_ADAPT
+	ulint innodb_ahi_hit;
+	ulint innodb_ahi_miss;
+#endif /* BTR_CUR_HASH_ADAPT */
+	char  innodb_buffer_pool_dump_status[OS_FILE_MAX_PATH + 128];/*!< Buffer pool dump status */
+	char  innodb_buffer_pool_load_status[OS_FILE_MAX_PATH + 128];/*!< Buffer pool load status */
+	char  innodb_buffer_pool_resize_status[512];/*!< Buffer pool resize status */
+	my_bool innodb_buffer_pool_load_incomplete;/*!< Buffer pool load incomplete */
+	ulint innodb_buffer_pool_pages_total;	/*!< Buffer pool size */
+	ulint innodb_buffer_pool_bytes_data;	/*!< File bytes used */
+	ulint innodb_buffer_pool_pages_misc;	/*!< Miscellaneous pages */
+#ifdef UNIV_DEBUG
+	ulint innodb_buffer_pool_pages_latched;	/*!< Latched pages */
+#endif /* UNIV_DEBUG */
+	/** buf_pool.stat.n_page_gets (a sharded counter) */
+	ulint innodb_buffer_pool_read_requests;
+	ulint innodb_checkpoint_age;
+	ulint innodb_checkpoint_max_age;
+	ulint innodb_data_pending_reads;	/*!< Pending reads */
+	ulint innodb_data_pending_writes;	/*!< Pending writes */
+	ulint innodb_data_read;			/*!< Data bytes read */
+	ulint innodb_data_writes;		/*!< I/O write requests */
+	ulint innodb_data_written;		/*!< Data bytes written */
+	ulint innodb_data_reads;		/*!< I/O read requests */
+	ulint innodb_dblwr_pages_written;	/*!< srv_dblwr_pages_written */
+	ulint innodb_dblwr_writes;		/*!< srv_dblwr_writes */
+	ulint innodb_deadlocks;
+	ulint innodb_history_list_length;
+	lsn_t innodb_lsn_current;
+	lsn_t innodb_lsn_flushed;
+	lsn_t innodb_lsn_last_checkpoint;
+	trx_id_t innodb_max_trx_id;
+#ifdef BTR_CUR_HASH_ADAPT
+	ulint innodb_mem_adaptive_hash;
+#endif /* BTR_CUR_HASH_ADAPT */
+	ulint innodb_mem_dictionary;
+	/** log_sys.get_lsn() - recv_sys.lsn */
+	lsn_t innodb_os_log_written;
+	ulint innodb_row_lock_waits;		/*!< srv_n_lock_wait_count */
+	ulint innodb_row_lock_current_waits;	/*!< srv_n_lock_wait_current_count */
+	int64_t innodb_row_lock_time;		/*!< srv_n_lock_wait_time
+						/ 1000 */
+	uint64_t innodb_row_lock_time_avg;	/*!< srv_n_lock_wait_time
+						     / srv_n_lock_wait_count */
+	uint64_t innodb_row_lock_time_max;	/*!< srv_n_lock_max_wait_time */
+
+	/** Number of undo tablespace truncation operations */
+	ulong innodb_undo_truncations;
+	ulint innodb_defragment_compression_failures; /*!< Number of
+						defragment re-compression
+						failures */
+
+	ulint innodb_defragment_failures;	/*!< Number of defragment
+						failures */
+	ulint innodb_defragment_count;		/*!< Number of defragment
+						operations */
+
+	/** Number of instant ALTER TABLE operations that affect columns */
+	ulong innodb_instant_alter_column;
+
+	ulint innodb_onlineddl_rowlog_rows;	/*!< Online alter rows */
+	ulint innodb_onlineddl_rowlog_pct_used; /*!< Online alter percentage
+						of used row log buffer */
+	ulint innodb_onlineddl_pct_progress;	/*!< Online alter progress */
+
+	int64_t innodb_page_compression_saved;/*!< Number of bytes saved
+						by page compression */
+	int64_t innodb_pages_page_compressed;/*!< Number of pages
+						compressed by page compression */
+	int64_t innodb_page_compressed_trim_op;/*!< Number of TRIM operations
+						induced by page compression */
+	int64_t innodb_pages_page_decompressed;/*!< Number of pages
+						decompressed by page
+						compression */
+	int64_t innodb_pages_page_compression_error;/*!< Number of page
+						compression errors */
+	int64_t innodb_pages_encrypted;      /*!< Number of pages
+						encrypted */
+	int64_t innodb_pages_decrypted;      /*!< Number of pages
+						decrypted */
+
+	/** Number of merge blocks encrypted */
+	ib_int64_t innodb_n_merge_blocks_encrypted;
+	/** Number of merge blocks decrypted */
+	ib_int64_t innodb_n_merge_blocks_decrypted;
+	/** Number of row log blocks encrypted */
+	ib_int64_t innodb_n_rowlog_blocks_encrypted;
+	/** Number of row log blocks decrypted */
+	ib_int64_t innodb_n_rowlog_blocks_decrypted;
+
+	/** Number of temporary tablespace pages encrypted */
+	ib_int64_t innodb_n_temp_blocks_encrypted;
+
+	/** Number of temporary tablespace pages decrypted */
+	ib_int64_t innodb_n_temp_blocks_decrypted;
+
+	ulint innodb_encryption_rotation_pages_read_from_cache;
+	ulint innodb_encryption_rotation_pages_read_from_disk;
+	ulint innodb_encryption_rotation_pages_modified;
+	ulint innodb_encryption_rotation_pages_flushed;
+	ulint innodb_encryption_rotation_estimated_iops;
+	int64_t innodb_encryption_key_requests;
+};
+
+extern tpool::thread_pool *srv_thread_pool;
+extern std::unique_ptr<tpool::timer> srv_master_timer;
+extern std::unique_ptr<tpool::timer> srv_monitor_timer;
+
+/** The interval at which srv_monitor_task is invoked, in milliseconds */
+constexpr unsigned SRV_MONITOR_INTERVAL= 15000; /* 4 times per minute */
+
+static inline void srv_monitor_timer_schedule_now()
+{ /* Re-arms srv_monitor_timer; presumably 0 means "fire immediately" (TODO confirm) */
+  srv_monitor_timer->set_time(0, SRV_MONITOR_INTERVAL);
+}
+static inline void srv_start_periodic_timer(std::unique_ptr<tpool::timer>& t,
+                                            void (*func)(void*), int period)
+{ /* Replaces t with a timer that srv_thread_pool creates for func, then arms it */
+  t.reset(srv_thread_pool->create_timer(func));
+  t->set_time(0, period); /* presumably: no initial delay, then every `period` ms (TODO confirm) */
+}
+
+void srv_thread_pool_init();
+void srv_thread_pool_end();
diff --git a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h
new file mode 100644
index 00000000..c18cf1ce
--- /dev/null
+++ b/storage/innobase/include/srv0start.h
@@ -0,0 +1,124 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0start.h
+Starts the Innobase database server
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "log0log.h"
+#include "ut0byte.h"
+
+// Forward declaration
+struct dict_table_t;
+
+/** Open the configured number of dedicated undo tablespaces.
+@param[in]      create_new_undo whether the undo tablespaces have to be created
+@param[in,out]  mtr             mini-transaction
+@return DB_SUCCESS or error code */
+dberr_t srv_undo_tablespaces_init(bool create_new_undo, mtr_t *mtr);
+
+/** Start InnoDB.
+@param[in]	create_new_db	whether to create a new database
+@return DB_SUCCESS or error code */
+dberr_t srv_start(bool create_new_db);
+
+/**
+  Shut down purge to make sure that there is no possibility that we call any
+  plugin code (e.g., audit) inside virtual column computation.
+*/
+void innodb_preshutdown();
+
+/** Shut down InnoDB. */
+void innodb_shutdown();
+
+/*************************************************************//**
+Copy the file path component of the physical file to parameter. It will
+copy up to and including the terminating path separator.
+@return number of bytes copied or ULINT_UNDEFINED if destination buffer
+	is smaller than the path to be copied. */
+ulint
+srv_path_copy(
+/*==========*/
+	char*		dest,		/*!< out: destination buffer */
+	ulint		dest_len,	/*!< in: max bytes to copy */
+	const char*	basedir,	/*!< in: base directory */
+	const char*	table_name)	/*!< in: source table name */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Get the meta-data filename from the table name for a
+single-table tablespace.
+@param[in]	table		table object
+@param[out]	filename	filename
+@param[in]	max_len		filename max length */
+void
+srv_get_meta_data_filename(
+	dict_table_t*	table,
+	char*		filename,
+	ulint		max_len);
+
+/** Get the encryption-data filename from the table name for a
+single-table tablespace.
+@param[in]	table		table object
+@param[out]	filename	filename
+@param[in]	max_len		filename max length */
+void
+srv_get_encryption_data_filename(
+	dict_table_t*	table,
+	char*		filename,
+	ulint		max_len);
+
+/** Log sequence number at shutdown */
+extern	lsn_t	srv_shutdown_lsn;
+
+/** TRUE if the server is being started */
+extern	bool	srv_is_being_started;
+/** TRUE if the server is being started, before rolling back any
+incomplete transactions */
+extern	bool	srv_startup_is_before_trx_rollback_phase;
+
+/** TRUE if a raw partition is in use */
+extern	ibool	srv_start_raw_disk_in_use;
+
+/** Shutdown state; the type of srv_shutdown_state */
+enum srv_shutdown_t {
+	SRV_SHUTDOWN_NONE = 0,	/*!< Database running normally */
+	/** Shutdown initiated in srv_shutdown_bg_undo_sources() */
+	SRV_SHUTDOWN_INITIATED,
+	SRV_SHUTDOWN_CLEANUP,	/*!< Cleaning up in
+				logs_empty_and_mark_files_at_shutdown() */
+	SRV_SHUTDOWN_LAST_PHASE,/*!< Last phase after ensuring that
+				the buffer pool can be freed: flush
+				all file spaces and close all files */
+	SRV_SHUTDOWN_EXIT_THREADS/*!< Exit all threads */
+};
+
+/** Whether any undo log records can be generated */
+extern bool srv_undo_sources;
+
+/** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to
+SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */
+extern	enum srv_shutdown_t	srv_shutdown_state;
+
+/** Files comprising the system tablespace */
+extern pfs_os_file_t	files[1000];
diff --git a/storage/innobase/include/srw_lock.h b/storage/innobase/include/srw_lock.h
new file mode 100644
index 00000000..1dca0cc1
--- /dev/null
+++ b/storage/innobase/include/srw_lock.h
@@ -0,0 +1,554 @@
+/*****************************************************************************
+
+Copyright (c) 2020, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include "univ.i"
+#include "rw_lock.h"
+
+#if defined __linux__
+/* futex(2): FUTEX_WAIT_PRIVATE, FUTEX_WAKE_PRIVATE */
+#elif defined __OpenBSD__ || defined __FreeBSD__ || defined __DragonFly__
+/* system calls similar to Linux futex(2) */
+#elif defined _WIN32
+/* SRWLOCK as well as WaitOnAddress(), WakeByAddressSingle() */
+#else
+# define SUX_LOCK_GENERIC /* fall back to generic synchronization primitives */
+#endif
+
+#if !defined SUX_LOCK_GENERIC && 0 /* defined SAFE_MUTEX */
+# define SUX_LOCK_GENERIC /* Use dummy implementation for debugging purposes */
+#endif
+
+#ifdef SUX_LOCK_GENERIC
+/** An exclusive-only variant of srw_lock; a thin wrapper of pthread_mutex_t */
+template<bool spinloop>
+class pthread_mutex_wrapper final
+{
+  pthread_mutex_t lock; /*!< the wrapped mutex */
+public:
+  void init()
+  {
+    if (spinloop) /* the spinloop variant is created with MY_MUTEX_INIT_FAST */
+      pthread_mutex_init(&lock, MY_MUTEX_INIT_FAST);
+    else
+      pthread_mutex_init(&lock, nullptr);
+  }
+  void destroy() { pthread_mutex_destroy(&lock); }
+# ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+  void wr_lock() { pthread_mutex_lock(&lock); }
+# else
+private:
+  void wr_wait(); /* slow path of wr_lock(); only declared for spinloop=true below */
+public:
+  inline void wr_lock(); /* specialized per spinloop value after the class */
+# endif
+  void wr_unlock() { pthread_mutex_unlock(&lock); }
+  bool wr_lock_try() { return !pthread_mutex_trylock(&lock); } /*!< @return whether the mutex was acquired */
+};
+
+# ifndef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+template<> void pthread_mutex_wrapper<true>::wr_wait(); /* declared only; no body in this header */
+template<>
+inline void pthread_mutex_wrapper<false>::wr_lock() /* non-spinning variant: block right away */
+{ pthread_mutex_lock(&lock); }
+template<>
+inline void pthread_mutex_wrapper<true>::wr_lock() /* try once, else take the wr_wait() slow path */
+{ if (!wr_lock_try()) wr_wait(); }
+# endif
+#endif
+
+/** Futex-based mutex (with a pthread mutex and condition variable if SUX_LOCK_GENERIC) */
+template<bool spinloop>
+class srw_mutex_impl final
+{
+  /** The lock word, containing HOLDER + 1 if the lock is being held,
+  plus the number of waiters */
+  std::atomic<uint32_t> lock;
+  /** Flag in the lock word that identifies that the lock is being held */
+  static constexpr uint32_t HOLDER= 1U << 31;
+
+#ifdef SUX_LOCK_GENERIC
+public:
+  /** The mutex for the condition variables. */
+  pthread_mutex_t mutex;
+private:
+  /** Condition variable for the lock word. Used with mutex. */
+  pthread_cond_t cond;
+#endif
+
+  /** Wait until the mutex has been acquired; the slow path of wr_lock() */
+  void wait_and_lock();
+  /** Wait for lock!=lk */
+  inline void wait(uint32_t lk);
+  /** Wake up one wait() thread */
+  void wake();
+public:
+  /** @return whether the mutex is being held or waited for */
+  bool is_locked_or_waiting() const
+  { return lock.load(std::memory_order_acquire) != 0; }
+  /** @return whether the mutex is being held by any thread */
+  bool is_locked() const
+  { return (lock.load(std::memory_order_acquire) & HOLDER) != 0; }
+
+  void init()
+  {
+    DBUG_ASSERT(!is_locked_or_waiting());
+#ifdef SUX_LOCK_GENERIC
+    pthread_mutex_init(&mutex, nullptr);
+    pthread_cond_init(&cond, nullptr);
+#endif
+  }
+  void destroy()
+  {
+    DBUG_ASSERT(!is_locked_or_waiting());
+#ifdef SUX_LOCK_GENERIC
+    pthread_mutex_destroy(&mutex);
+    pthread_cond_destroy(&cond);
+#endif
+  }
+
+  /** @return whether the mutex was acquired */
+  bool wr_lock_try()
+  {
+    uint32_t lk= 0; /* expected old value: neither held nor waited for */
+    return lock.compare_exchange_strong(lk, HOLDER + 1,
+                                        std::memory_order_acquire,
+                                        std::memory_order_relaxed);
+  }
+
+  void wr_lock() { if (!wr_lock_try()) wait_and_lock(); }
+  void wr_unlock()
+  {
+    const uint32_t lk= lock.fetch_sub(HOLDER + 1, std::memory_order_release);
+    if (lk != HOLDER + 1) /* the old lock word also counted waiters */
+    {
+      DBUG_ASSERT(lk & HOLDER);
+      wake();
+    }
+  }
+};
+
+#ifdef SUX_LOCK_GENERIC
+typedef pthread_mutex_wrapper<true> srw_spin_mutex;
+typedef pthread_mutex_wrapper<false> srw_mutex;
+#else
+typedef srw_mutex_impl<true> srw_spin_mutex;
+typedef srw_mutex_impl<false> srw_mutex;
+#endif
+
+template<bool spinloop> class srw_lock_impl;
+
+/** Slim shared-update-exclusive (S/U/X) lock with no recursion */
+template<bool spinloop>
+class ssux_lock_impl final
+{
+#ifdef UNIV_PFS_RWLOCK
+  friend class ssux_lock;
+# ifdef SUX_LOCK_GENERIC
+# elif defined _WIN32
+# else
+  friend srw_lock_impl<spinloop>;
+# endif
+#endif
+  /** mutex for synchronization; held by U or X lock holders */
+  srw_mutex_impl<spinloop> writer;
+#ifdef SUX_LOCK_GENERIC
+  /** Condition variable for "readers"; used with writer.mutex. */
+  pthread_cond_t readers_cond;
+#endif
+  /** S or U holders, and WRITER flag for X holder or waiter */
+  std::atomic<uint32_t> readers;
+  /** indicates an X request; readers=WRITER indicates granted X lock */
+  static constexpr uint32_t WRITER= 1U << 31;
+
+  /** Wait for readers!=lk */
+  inline void wait(uint32_t lk);
+
+  /** Wait for readers!=lk|WRITER */
+  void wr_wait(uint32_t lk);
+  /** Wake up wait() on the last rd_unlock() */
+  void wake();
+  /** Acquire a read lock; the slow path of rd_lock() */
+  void rd_wait();
+public:
+  void init()
+  {
+    writer.init();
+    DBUG_ASSERT(is_vacant());
+#ifdef SUX_LOCK_GENERIC
+    pthread_cond_init(&readers_cond, nullptr);
+#endif
+  }
+  void destroy()
+  {
+    DBUG_ASSERT(is_vacant());
+    writer.destroy();
+#ifdef SUX_LOCK_GENERIC
+    pthread_cond_destroy(&readers_cond);
+#endif
+  }
+  /** @return whether the WRITER flag is set (X lock held or requested) */
+  bool is_waiting() const
+  { return (readers.load(std::memory_order_relaxed) & WRITER) != 0; }
+#ifndef DBUG_OFF
+  /** @return whether the lock is neither being held nor waited for */
+  bool is_vacant() const { return !is_locked_or_waiting(); }
+#endif /* !DBUG_OFF */
+
+  bool rd_lock_try()
+  {
+    uint32_t lk= 0;
+    while (!readers.compare_exchange_weak(lk, lk + 1,
+                                          std::memory_order_acquire,
+                                          std::memory_order_relaxed))
+      if (lk & WRITER) /* an X lock is held or requested: give up */
+        return false;
+    return true;
+  }
+
+  bool u_lock_try()
+  {
+    if (!writer.wr_lock_try())
+      return false;
+    IF_DBUG_ASSERT(uint32_t lk=,)
+    readers.fetch_add(1, std::memory_order_acquire); /* register the U holder in readers */
+    DBUG_ASSERT(lk < WRITER - 1);
+    return true;
+  }
+
+  bool wr_lock_try()
+  {
+    if (!writer.wr_lock_try())
+      return false;
+    uint32_t lk= 0; /* X can only be granted when there are no S or U holders */
+    if (readers.compare_exchange_strong(lk, WRITER,
+                                        std::memory_order_acquire,
+                                        std::memory_order_relaxed))
+      return true;
+    writer.wr_unlock(); /* back out: readers was not 0 */
+    return false;
+  }
+
+  void rd_lock() { if (!rd_lock_try()) rd_wait(); }
+  void u_lock()
+  {
+    writer.wr_lock();
+    IF_DBUG_ASSERT(uint32_t lk=,)
+    readers.fetch_add(1, std::memory_order_acquire); /* register the U holder in readers */
+    DBUG_ASSERT(lk < WRITER - 1);
+  }
+  void wr_lock()
+  {
+    writer.wr_lock();
+#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
+    /* On IA-32 and AMD64, this type of fetch_or() can only be implemented
+    as a loop around LOCK CMPXCHG. In this particular case, setting the
+    most significant bit using fetch_add() is equivalent, and is
+    translated into a simple LOCK XADD. */
+    static_assert(WRITER == 1U << 31, "compatibility");
+    if (uint32_t lk= readers.fetch_add(WRITER, std::memory_order_acquire))
+      wr_wait(lk);
+#else
+    if (uint32_t lk= readers.fetch_or(WRITER, std::memory_order_acquire))
+      wr_wait(lk);
+#endif
+  }
+
+  void u_wr_upgrade()
+  {
+    DBUG_ASSERT(writer.is_locked());
+    uint32_t lk= readers.fetch_add(WRITER - 1, std::memory_order_acquire); /* drop our U count, set WRITER */
+    if (lk != 1) /* the old value counted more than our own U reference */
+      wr_wait(lk - 1);
+  }
+  void wr_u_downgrade()
+  {
+    DBUG_ASSERT(writer.is_locked());
+    DBUG_ASSERT(is_write_locked());
+    readers.store(1, std::memory_order_release);
+    /* Note: Any pending rd_lock() will not be woken up until u_unlock() */
+  }
+
+  void rd_unlock()
+  {
+    uint32_t lk= readers.fetch_sub(1, std::memory_order_release);
+    ut_ad(~WRITER & lk);
+    if (lk == WRITER + 1) /* we were the last reader while WRITER was set */
+      wake();
+  }
+  void u_unlock()
+  {
+    IF_DBUG_ASSERT(uint32_t lk=,)
+    readers.fetch_sub(1, std::memory_order_release);
+    DBUG_ASSERT(lk);
+    DBUG_ASSERT(lk < WRITER);
+    writer.wr_unlock();
+  }
+  void wr_unlock()
+  {
+    DBUG_ASSERT(is_write_locked());
+    readers.store(0, std::memory_order_release);
+    writer.wr_unlock();
+  }
+  /** @return whether an exclusive lock may be held by any thread */
+  bool is_write_locked() const noexcept
+  { return readers.load(std::memory_order_acquire) == WRITER; }
+  /** @return whether any lock may be held by any thread */
+  bool is_locked() const noexcept
+  { return readers.load(std::memory_order_acquire) != 0; }
+  /** @return whether any lock may be held or waited for by any thread */
+  bool is_locked_or_waiting() const noexcept
+  { return is_locked() || writer.is_locked_or_waiting(); }
+
+  void lock_shared() { rd_lock(); }
+  void unlock_shared() { rd_unlock(); }
+  void lock() { wr_lock(); }
+  void unlock() { wr_unlock(); }
+};
+
+#if defined _WIN32 || defined SUX_LOCK_GENERIC
+/** Slim read-write lock, wrapping SRWLOCK on _WIN32 and rw_lock_t otherwise */
+template<bool spinloop>
+class srw_lock_
+{
+# ifdef UNIV_PFS_RWLOCK
+  friend srw_lock_impl<spinloop>;
+# endif
+# ifdef _WIN32
+  SRWLOCK lk;
+# else
+  rw_lock_t lk;
+# endif
+
+  void rd_wait(); /* slow path of rd_lock(); only declared for spinloop=true below */
+  void wr_wait(); /* slow path of wr_lock(); only declared for spinloop=true below */
+public:
+  void init() { IF_WIN(,my_rwlock_init(&lk, nullptr)); }
+  void destroy() { IF_WIN(,rwlock_destroy(&lk)); }
+  inline void rd_lock(); /* specialized per spinloop value after the class */
+  inline void wr_lock(); /* specialized per spinloop value after the class */
+  bool rd_lock_try()
+  { return IF_WIN(TryAcquireSRWLockShared(&lk), !rw_tryrdlock(&lk)); }
+  void rd_unlock()
+  { IF_WIN(ReleaseSRWLockShared(&lk), rw_unlock(&lk)); }
+  bool wr_lock_try()
+  { return IF_WIN(TryAcquireSRWLockExclusive(&lk), !rw_trywrlock(&lk)); }
+  void wr_unlock()
+  { IF_WIN(ReleaseSRWLockExclusive(&lk), rw_unlock(&lk)); }
+#ifdef _WIN32
+  /** @return whether the raw SRWLOCK word is nonzero (presumably: held or waited for) */
+  bool is_locked_or_waiting() const noexcept { return (size_t&)(lk) != 0; }
+  /** @return whether any lock may be held by any thread */
+  bool is_locked() const noexcept { return is_locked_or_waiting(); }
+  /** @return whether an exclusive lock may be held by any thread */
+  bool is_write_locked() const noexcept
+  {
+    // FIXME: this returns false positives for shared locks
+    return is_locked();
+  }
+
+  void lock_shared() { rd_lock(); }
+  void unlock_shared() { rd_unlock(); }
+  void lock() { wr_lock(); }
+  void unlock() { wr_unlock(); }
+#endif
+};
+
+template<> void srw_lock_<true>::rd_wait(); /* declared only; no body in this header */
+template<> void srw_lock_<true>::wr_wait(); /* declared only; no body in this header */
+
+template<>
+inline void srw_lock_<false>::rd_lock() /* non-spinning variant: block in the native call */
+{ IF_WIN(AcquireSRWLockShared(&lk), rw_rdlock(&lk)); }
+template<>
+inline void srw_lock_<false>::wr_lock() /* non-spinning variant: block in the native call */
+{ IF_WIN(AcquireSRWLockExclusive(&lk), rw_wrlock(&lk)); }
+
+template<> /* spinloop variant: try once, else take the rd_wait() slow path */
+inline void srw_lock_<true>::rd_lock() { if (!rd_lock_try()) rd_wait(); }
+template<> /* spinloop variant: try once, else take the wr_wait() slow path */
+inline void srw_lock_<true>::wr_lock() { if (!wr_lock_try()) wr_wait(); }
+
+typedef srw_lock_<false> srw_lock_low;
+typedef srw_lock_<true> srw_spin_lock_low;
+#else
+typedef ssux_lock_impl<false> srw_lock_low;
+typedef ssux_lock_impl<true> srw_spin_lock_low;
+#endif
+
+#ifndef UNIV_PFS_RWLOCK
+# define SRW_LOCK_INIT(key) init()
+# define SRW_LOCK_ARGS(file, line) /* nothing */
+# define SRW_LOCK_CALL /* nothing */
+typedef srw_lock_low srw_lock;
+typedef srw_spin_lock_low srw_spin_lock;
+#else
+# define SRW_LOCK_INIT(key) init(key)
+# define SRW_LOCK_ARGS(file, line) file, line
+# define SRW_LOCK_CALL __FILE__, __LINE__
+
+/** Slim shared-update-exclusive lock with PERFORMANCE_SCHEMA instrumentation */
+class ssux_lock
+{
+  PSI_rwlock *pfs_psi; /*!< handle returned by init_rwlock; checked for nullptr before use */
+  ssux_lock_impl<false> lock; /*!< the underlying lock */
+
+  ATTRIBUTE_NOINLINE void psi_rd_lock(const char *file, unsigned line);
+  ATTRIBUTE_NOINLINE void psi_wr_lock(const char *file, unsigned line);
+  ATTRIBUTE_NOINLINE void psi_u_lock(const char *file, unsigned line);
+  ATTRIBUTE_NOINLINE void psi_u_wr_upgrade(const char *file, unsigned line);
+public:
+  void init(mysql_pfs_key_t key)
+  {
+    pfs_psi= PSI_RWLOCK_CALL(init_rwlock)(key, this);
+    lock.init();
+  }
+  void destroy()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+    {
+      PSI_RWLOCK_CALL(destroy_rwlock)(pfs_psi);
+      pfs_psi= nullptr;
+    }
+    lock.destroy();
+  }
+  void rd_lock(const char *file, unsigned line) /* file, line are forwarded to psi_rd_lock() */
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      psi_rd_lock(file, line);
+    else
+      lock.rd_lock();
+  }
+  void rd_unlock()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+    lock.rd_unlock();
+  }
+  void u_lock(const char *file, unsigned line) /* file, line are forwarded to psi_u_lock() */
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      psi_u_lock(file, line);
+    else
+      lock.u_lock();
+  }
+  void u_unlock()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+    lock.u_unlock();
+  }
+  void wr_lock(const char *file, unsigned line) /* file, line are forwarded to psi_wr_lock() */
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      psi_wr_lock(file, line);
+    else
+      lock.wr_lock();
+  }
+  void wr_unlock()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+    lock.wr_unlock();
+  }
+  void u_wr_upgrade(const char *file, unsigned line) /* file, line are forwarded to psi_u_wr_upgrade() */
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      psi_u_wr_upgrade(file, line);
+    else
+      lock.u_wr_upgrade();
+  }
+  bool rd_lock_try() { return lock.rd_lock_try(); } /* the *_try() calls bypass pfs_psi */
+  bool u_lock_try() { return lock.u_lock_try(); }
+  bool wr_lock_try() { return lock.wr_lock_try(); }
+  bool is_waiting() const { return lock.is_waiting(); }
+};
+
+/** Slim reader-writer lock with PERFORMANCE_SCHEMA instrumentation */
+template<bool spinloop>
+class srw_lock_impl
+{
+  PSI_rwlock *pfs_psi; /*!< handle returned by init_rwlock; checked for nullptr before use */
+# if defined _WIN32 || defined SUX_LOCK_GENERIC
+  srw_lock_<spinloop> lock;
+# else
+  ssux_lock_impl<spinloop> lock;
+# endif
+
+  ATTRIBUTE_NOINLINE void psi_rd_lock(const char *file, unsigned line);
+  ATTRIBUTE_NOINLINE void psi_wr_lock(const char *file, unsigned line);
+public:
+  void init(mysql_pfs_key_t key)
+  {
+    pfs_psi= PSI_RWLOCK_CALL(init_rwlock)(key, this);
+    lock.init();
+  }
+  void destroy()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+    {
+      PSI_RWLOCK_CALL(destroy_rwlock)(pfs_psi);
+      pfs_psi= nullptr;
+    }
+    lock.destroy();
+  }
+  void rd_lock(const char *file, unsigned line) /* file, line are forwarded to psi_rd_lock() */
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      psi_rd_lock(file, line);
+    else
+      lock.rd_lock();
+  }
+  void rd_unlock()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+    lock.rd_unlock();
+  }
+  void wr_lock(const char *file, unsigned line) /* file, line are forwarded to psi_wr_lock() */
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      psi_wr_lock(file, line);
+    else
+      lock.wr_lock();
+  }
+  void wr_unlock()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+    lock.wr_unlock();
+  }
+  bool rd_lock_try() { return lock.rd_lock_try(); } /* the *_try() calls bypass pfs_psi */
+  bool wr_lock_try() { return lock.wr_lock_try(); }
+  void lock_shared() { return rd_lock(SRW_LOCK_CALL); }
+  void unlock_shared() { return rd_unlock(); }
+#ifndef SUX_LOCK_GENERIC
+  /** @return whether any lock may be held or waited for by any thread */
+  bool is_locked_or_waiting() const noexcept
+  { return lock.is_locked_or_waiting(); }
+  /** @return whether any lock may be held by any thread */
+  bool is_locked() const noexcept { return lock.is_locked(); }
+  /** @return whether an exclusive lock may be held by any thread */
+  bool is_write_locked() const noexcept { return lock.is_write_locked(); }
+#endif
+};
+
+typedef srw_lock_impl<false> srw_lock;
+typedef srw_lock_impl<true> srw_spin_lock;
+
+#endif
diff --git a/storage/innobase/include/sux_lock.h b/storage/innobase/include/sux_lock.h
new file mode 100644
index 00000000..2c0167ac
--- /dev/null
+++ b/storage/innobase/include/sux_lock.h
@@ -0,0 +1,472 @@
+/*****************************************************************************
+
+Copyright (c) 2020, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include "srw_lock.h"
+#include "my_atomic_wrapper.h"
+#ifdef UNIV_DEBUG
+# include <unordered_set>
+#endif
+
+/** A "fat" rw-lock that supports
+S (shared), U (update, or shared-exclusive), and X (exclusive) modes
+as well as recursive U and X latch acquisition
+@tparam ssux  underlying non-recursive lock (ssux_lock_impl or ssux_lock) */
+template<typename ssux>
+class sux_lock final
+{
+  /** The underlying non-recursive lock */
+  ssux lock;
+  /** U count * RECURSIVE_U + X count * RECURSIVE_X. Protected by lock. */
+  uint32_t recursive;
+  /** The owner of the U or X lock (0 if none); protected by lock */
+  std::atomic<pthread_t> writer;
+  /** Special writer!=0 value to indicate that the lock is non-recursive
+  and will be released by an I/O thread */
+#if defined __linux__ || defined _WIN32
+  static constexpr pthread_t FOR_IO= pthread_t(~0UL);
+#else
+# define FOR_IO ((pthread_t) ~0UL) /* it could be a pointer */
+#endif
+#ifdef UNIV_DEBUG
+  /** Protects readers */
+  mutable srw_mutex readers_lock;
+  /** Threads that hold the lock in shared mode */
+  std::atomic<std::unordered_multiset<pthread_t>*> readers;
+#endif
+
+  /** The multiplier in recursive for X locks */
+  static constexpr uint32_t RECURSIVE_X= 1U;
+  /** The multiplier in recursive for U locks */
+  static constexpr uint32_t RECURSIVE_U= 1U << 16;
+  /** The maximum allowed level of recursion */
+  static constexpr uint32_t RECURSIVE_MAX= RECURSIVE_U - 1;
+
+public:
+#ifdef UNIV_PFS_RWLOCK
+  inline void init();
+#endif
+  void SRW_LOCK_INIT(mysql_pfs_key_t key)
+  {
+    lock.SRW_LOCK_INIT(key);
+    ut_ad(!writer.load(std::memory_order_relaxed));
+    ut_ad(!recursive);
+    ut_d(readers_lock.init());
+#ifdef UNIV_DEBUG
+    if (auto r= readers.load(std::memory_order_relaxed))
+      ut_ad(r->empty());
+#endif
+  }
+
+  /** Free the rw-lock after init(); no U or X lock may be held */
+  void free()
+  {
+    ut_ad(!writer.load(std::memory_order_relaxed));
+    ut_ad(!recursive);
+#ifdef UNIV_DEBUG
+    readers_lock.destroy();
+    if (auto r= readers.load(std::memory_order_relaxed))
+    {
+      ut_ad(r->empty());
+      delete r;
+      readers.store(nullptr, std::memory_order_relaxed);
+    }
+#endif
+    lock.destroy();
+  }
+
+  /** needed for dict_index_t::clone() */
+  inline void operator=(const sux_lock&);
+
+#ifdef UNIV_DEBUG
+  /** @return whether exactly one U or one X lock is held (no recursion) */
+  bool not_recursive() const
+  {
+    ut_ad(recursive);
+    return recursive == RECURSIVE_X || recursive == RECURSIVE_U;
+  }
+
+  /** @return the number of X locks being held (by any thread) */
+  unsigned x_lock_count() const { return recursive & RECURSIVE_MAX; }
+#endif
+
+  /** Acquire a recursive U (allow_readers) or X (!allow_readers) lock */
+  template<bool allow_readers> void writer_recurse()
+  {
+    ut_ad(writer == pthread_self());
+    ut_d(auto rec= (recursive / (allow_readers ? RECURSIVE_U : RECURSIVE_X)) &
+         RECURSIVE_MAX);
+    ut_ad(allow_readers ? recursive : rec);
+    ut_ad(rec < RECURSIVE_MAX);
+    recursive+= allow_readers ? RECURSIVE_U : RECURSIVE_X;
+  }
+
+private:
+  /** Transfer the ownership of a write lock to another thread
+  @param id the new owner of the U or X lock */
+  void set_new_owner(pthread_t id)
+  {
+    IF_DBUG(DBUG_ASSERT(writer.exchange(id, std::memory_order_relaxed)),
+            writer.store(id, std::memory_order_relaxed));
+  }
+  /** Assign the ownership of a write lock to a thread
+  @param id the owner of the U or X lock */
+  void set_first_owner(pthread_t id)
+  {
+    IF_DBUG(DBUG_ASSERT(!writer.exchange(id, std::memory_order_relaxed)),
+            writer.store(id, std::memory_order_relaxed));
+  }
+#ifdef UNIV_DEBUG
+  /** Register the current thread as a holder of a shared lock */
+  void s_lock_register()
+  {
+    const pthread_t id= pthread_self();
+    readers_lock.wr_lock();
+    auto r= readers.load(std::memory_order_relaxed);
+    if (!r)
+    {
+      r= new std::unordered_multiset<pthread_t>();
+      readers.store(r, std::memory_order_relaxed);
+    }
+    r->emplace(id);
+    readers_lock.wr_unlock();
+  }
+#endif
+
+public:
+  /** In crash recovery or the change buffer, transfer the ownership
+  of the exclusive block lock to the current thread */
+  void claim_ownership() { set_new_owner(pthread_self()); }
+
+  /** @return whether the current thread is holding X or U latch */
+  bool have_u_or_x() const
+  {
+    if (pthread_self() != writer.load(std::memory_order_relaxed))
+      return false;
+    ut_ad(recursive);
+    return true;
+  }
+  /** @return whether the current thread is holding U but not X latch */
+  bool have_u_not_x() const
+  { return have_u_or_x() && !((recursive / RECURSIVE_X) & RECURSIVE_MAX); }
+  /** @return whether the current thread is holding X latch */
+  bool have_x() const
+  { return have_u_or_x() && ((recursive / RECURSIVE_X) & RECURSIVE_MAX); }
+#ifdef UNIV_DEBUG
+  /** @return whether the current thread is holding S latch */
+  bool have_s() const
+  {
+    if (auto r= readers.load(std::memory_order_relaxed))
+    {
+      readers_lock.wr_lock();
+      bool found= r->find(pthread_self()) != r->end();
+      readers_lock.wr_unlock();
+      return found;
+    }
+    return false;
+  }
+  /** @return whether the current thread is holding the latch */
+  bool have_any() const { return have_u_or_x() || have_s(); }
+#endif
+
+  /** Acquire a shared lock */
+  inline void s_lock();
+  inline void s_lock(const char *file, unsigned line);
+  /** Acquire an update lock */
+  inline void u_lock();
+  inline void u_lock(const char *file, unsigned line);
+  /** Acquire an exclusive lock */
+  inline void x_lock(bool for_io= false);
+  inline void x_lock(const char *file, unsigned line);
+  /** Acquire a recursive exclusive lock */
+  void x_lock_recursive() { writer_recurse<false>(); }
+  /** Upgrade an update lock */
+  inline void u_x_upgrade();
+  inline void u_x_upgrade(const char *file, unsigned line);
+  /** Downgrade a single exclusive lock to an update lock */
+  void x_u_downgrade()
+  {
+    ut_ad(have_u_or_x());
+    ut_ad(recursive <= RECURSIVE_MAX);
+    recursive*= RECURSIVE_U; /* the X count becomes the U count */
+    lock.wr_u_downgrade();
+  }
+
+  /** Acquire an exclusive lock or upgrade an update lock
+  @return whether U locks were upgraded to X */
+  inline bool x_lock_upgraded();
+
+  /** @return whether a shared lock was acquired */
+  bool s_lock_try()
+  {
+    bool acquired= lock.rd_lock_try();
+    ut_d(if (acquired) s_lock_register());
+    return acquired;
+  }
+
+  /** Try to acquire an update lock
+  @param for_io  whether the lock will be released by another thread
+  @return whether the update lock was acquired */
+  inline bool u_lock_try(bool for_io);
+
+  /** Try to acquire an exclusive lock
+  @return whether an exclusive lock was acquired */
+  inline bool x_lock_try();
+
+  /** Release a shared lock */
+  void s_unlock()
+  {
+#ifdef UNIV_DEBUG
+    const pthread_t id= pthread_self();
+    auto r= readers.load(std::memory_order_relaxed);
+    ut_ad(r);
+    readers_lock.wr_lock();
+    auto i= r->find(id);
+    ut_ad(i != r->end());
+    r->erase(i);
+    readers_lock.wr_unlock();
+#endif
+    lock.rd_unlock();
+  }
+  /** Release an update or exclusive lock
+  @param allow_readers    whether we are releasing a U lock
+  @param claim_ownership  whether the lock was acquired by another thread */
+  void u_or_x_unlock(bool allow_readers, bool claim_ownership= false)
+  {
+    ut_d(auto owner= writer.load(std::memory_order_relaxed));
+    ut_ad(owner == pthread_self() ||
+          (owner == FOR_IO && claim_ownership &&
+           recursive == (allow_readers ? RECURSIVE_U : RECURSIVE_X)));
+    ut_d(auto rec= (recursive / (allow_readers ? RECURSIVE_U : RECURSIVE_X)) &
+         RECURSIVE_MAX);
+    ut_ad(rec);
+    if (!(recursive-= allow_readers ? RECURSIVE_U : RECURSIVE_X))
+    {
+      set_new_owner(0); /* the last U or X count was released */
+      if (allow_readers)
+        lock.u_unlock();
+      else
+        lock.wr_unlock();
+    }
+  }
+  /** Release an update lock */
+  void u_unlock(bool claim_ownership= false)
+  { u_or_x_unlock(true, claim_ownership); }
+  /** Release an exclusive lock */
+  void x_unlock(bool claim_ownership= false)
+  { u_or_x_unlock(false, claim_ownership); }
+
+  /** @return whether any writer is waiting */
+  bool is_waiting() const { return lock.is_waiting(); }
+
+  bool is_write_locked() const { return lock.is_write_locked(); }
+
+  bool is_locked_or_waiting() const { return lock.is_locked_or_waiting(); }
+
+  inline void lock_shared();
+  inline void unlock_shared();
+};
+
+typedef sux_lock<ssux_lock_impl<true>> block_lock;
+
+#ifndef UNIV_PFS_RWLOCK
+typedef sux_lock<ssux_lock_impl<false>> index_lock;
+#else
+typedef sux_lock<ssux_lock> index_lock;
+/** Initialize a block_lock; unlike SRW_LOCK_INIT(), this takes no PSI key */
+template<> inline void sux_lock<ssux_lock_impl<true>>::init()
+{
+  lock.init();
+  ut_ad(!writer.load(std::memory_order_relaxed)); /* must already be zero */
+  ut_ad(!recursive);
+  ut_d(readers_lock.init());
+#ifdef UNIV_DEBUG
+  if (auto r= readers.load(std::memory_order_relaxed))
+    ut_ad(r->empty());
+#endif
+}
+
+template<>
+inline void sux_lock<ssux_lock>::s_lock(const char *file, unsigned line)
+{
+  ut_ad(!have_x()); /* S must not be requested while holding X */
+  ut_ad(!have_s()); /* S is not recursive */
+  lock.rd_lock(file, line);
+  ut_d(s_lock_register()); /* debug-only bookkeeping for have_s() */
+}
+
+/** Acquire an update lock; file and line are passed to lock.u_lock() */
+template<>
+inline void sux_lock<ssux_lock>::u_lock(const char *file, unsigned line)
+{
+  const pthread_t self= pthread_self();
+  /* The current U/X owner only increments the U recursion count. */
+  if (writer.load(std::memory_order_relaxed) == self)
+    return writer_recurse<true>();
+  /* Otherwise acquire the underlying lock and record the first owner. */
+  lock.u_lock(file, line);
+  ut_ad(!recursive);
+  recursive= RECURSIVE_U;
+  set_first_owner(self);
+}
+
+/** Acquire an exclusive lock; file and line are passed to lock.wr_lock() */
+template<>
+inline void sux_lock<ssux_lock>::x_lock(const char *file, unsigned line)
+{
+  const pthread_t self= pthread_self();
+  /* The current owner only increments the X recursion count. */
+  if (writer.load(std::memory_order_relaxed) == self)
+    return writer_recurse<false>();
+  /* Otherwise acquire the underlying lock and record the first owner. */
+  lock.wr_lock(file, line);
+  ut_ad(!recursive);
+  recursive= RECURSIVE_X;
+  set_first_owner(self);
+}
+
+template<>
+inline void sux_lock<ssux_lock>::u_x_upgrade(const char *file, unsigned line)
+{
+  ut_ad(have_u_not_x()); /* the caller must hold U and no X */
+  lock.u_wr_upgrade(file, line);
+  recursive/= RECURSIVE_U; /* the U count becomes the X count */
+}
+#endif
+
+/** needed for dict_index_t::clone(); ignores the source and zero-fills *this */
+template<> inline void index_lock::operator=(const sux_lock&)
+{
+  memset((void*) this, 0, sizeof *this);
+}
+
+template<typename ssux> inline void sux_lock<ssux>::s_lock()
+{
+  ut_ad(!have_x()); /* S must not be requested while holding X */
+  ut_ad(!have_s()); /* S is not recursive */
+  lock.rd_lock();
+  ut_d(s_lock_register()); /* debug-only bookkeeping for have_s() */
+}
+
+template<typename ssux>
+inline void sux_lock<ssux>::lock_shared() { s_lock(); }
+template<typename ssux>
+inline void sux_lock<ssux>::unlock_shared() { s_unlock(); }
+
+/** Acquire an update lock, recursively if this thread is already the owner */
+template<typename ssux> inline void sux_lock<ssux>::u_lock()
+{
+  const pthread_t self= pthread_self();
+  /* The current U/X owner only increments the U recursion count. */
+  if (writer.load(std::memory_order_relaxed) == self)
+    return writer_recurse<true>();
+  /* Otherwise acquire the underlying lock and record the first owner. */
+  lock.u_lock();
+  ut_ad(!recursive);
+  recursive= RECURSIVE_U;
+  set_first_owner(self);
+}
+
+/** Acquire an exclusive lock
+@param for_io  whether the lock will be released by another thread */
+template<typename ssux> inline void sux_lock<ssux>::x_lock(bool for_io)
+{
+  pthread_t self= pthread_self();
+  if (writer.load(std::memory_order_relaxed) == self)
+  {
+    /* Recursive acquisition; a FOR_IO lock must not be recursive. */
+    ut_ad(!for_io);
+    return writer_recurse<false>();
+  }
+  lock.wr_lock();
+  ut_ad(!recursive);
+  recursive= RECURSIVE_X;
+  set_first_owner(for_io ? FOR_IO : self);
+}
+
+template<typename ssux> inline void sux_lock<ssux>::u_x_upgrade()
+{
+  ut_ad(have_u_not_x()); /* the caller must hold U and no X */
+  lock.u_wr_upgrade();
+  recursive/= RECURSIVE_U; /* the U count becomes the X count */
+}
+
+/** Acquire an exclusive lock or upgrade an update lock
+@return whether U locks were upgraded to X */
+template<typename ssux> inline bool sux_lock<ssux>::x_lock_upgraded()
+{
+  const pthread_t self= pthread_self();
+  if (writer.load(std::memory_order_relaxed) != self)
+  {
+    lock.wr_lock();
+    ut_ad(!recursive);
+    recursive= RECURSIVE_X;
+    set_first_owner(self);
+    return false;
+  }
+  ut_ad(recursive);
+  static_assert(RECURSIVE_X == 1, "compatibility");
+  const bool upgrade= !(recursive & RECURSIVE_MAX);
+  if (upgrade)
+  {
+    /* Only U is held: turn the U count into an X count. */
+    lock.u_wr_upgrade();
+    recursive/= RECURSIVE_U;
+  }
+  else
+    writer_recurse<false>();
+  return upgrade;
+}
+
+/** @param for_io  whether the lock will be released by another thread
+@return whether the update lock was acquired */
+template<typename ssux> inline bool sux_lock<ssux>::u_lock_try(bool for_io)
+{
+  pthread_t self= pthread_self();
+  if (writer.load(std::memory_order_relaxed) == self)
+  {
+    /* A lock destined for an I/O thread is never taken recursively. */
+    if (!for_io)
+      writer_recurse<true>();
+    return !for_io;
+  }
+  if (!lock.u_lock_try())
+    return false;
+  ut_ad(!recursive);
+  recursive= RECURSIVE_U;
+  set_first_owner(for_io ? FOR_IO : self);
+  return true;
+}
+
+/** @return whether an exclusive lock was acquired (possibly recursively) */
+template<typename ssux> inline bool sux_lock<ssux>::x_lock_try()
+{
+  const pthread_t self= pthread_self();
+  if (writer.load(std::memory_order_relaxed) == self)
+  {
+    /* The current owner only increments the X recursion count. */
+    writer_recurse<false>();
+    return true;
+  }
+  if (!lock.wr_lock_try())
+    return false;
+  ut_ad(!recursive);
+  recursive= RECURSIVE_X;
+  set_first_owner(self);
+  return true;
+}
diff --git a/storage/innobase/include/transactional_lock_guard.h b/storage/innobase/include/transactional_lock_guard.h
new file mode 100644
index 00000000..168a6897
--- /dev/null
+++ b/storage/innobase/include/transactional_lock_guard.h
@@ -0,0 +1,174 @@
+/*****************************************************************************
+
+Copyright (c) 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+
+#if defined __powerpc64__
+#elif defined __s390__
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64) && !defined(__clang__)
+#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+# if __GNUC__ >= 8
+# elif defined __clang_major__ && __clang_major__ > 6
+# else
+#  define NO_ELISION
+# endif
+#else /* Transactional memory has not been implemented for this ISA */
+# define NO_ELISION
+#endif
+
+#ifdef NO_ELISION
+constexpr bool have_transactional_memory= false;
+# ifdef UNIV_DEBUG
+static inline bool xtest() { return false; }
+# endif
+# define TRANSACTIONAL_TARGET /* nothing */
+# define TRANSACTIONAL_INLINE /* nothing */
+#else
+# if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
+extern bool have_transactional_memory;
+bool transactional_lock_enabled();
+
+#  include <immintrin.h>
+#  if defined __GNUC__ && !defined __INTEL_COMPILER
+#   define TRANSACTIONAL_TARGET __attribute__((target("rtm"),hot))
+#   define TRANSACTIONAL_INLINE __attribute__((target("rtm"),hot,always_inline))
+#  else
+#   define TRANSACTIONAL_TARGET /* nothing */
+#   define TRANSACTIONAL_INLINE /* nothing */
+#  endif
+
+/** Try to start a memory transaction.
+@return whether have_transactional_memory holds and _xbegin() started one */
+TRANSACTIONAL_INLINE static inline bool xbegin()
+{ return have_transactional_memory && _xbegin() == _XBEGIN_STARTED; }
+
+#  ifdef UNIV_DEBUG
+#   ifdef __GNUC__
+/** @return whether a memory transaction is active */
+bool xtest();
+#   else
+static inline bool xtest() { return have_transactional_memory && _xtest(); }
+#   endif
+#  endif
+
+TRANSACTIONAL_INLINE static inline void xabort() { _xabort(0); }
+
+TRANSACTIONAL_INLINE static inline void xend() { _xend(); }
+# elif defined __powerpc64__ || defined __s390__
+extern bool have_transactional_memory;
+bool transactional_lock_enabled();
+#   define TRANSACTIONAL_TARGET __attribute__((hot))
+#   define TRANSACTIONAL_INLINE __attribute__((hot,always_inline))
+
+/**
+  Newer gcc compilers only provide __builtin_{htm}
+  functions when the -mhtm CFLAG is actually provided. So
+  we've got the option of including it globally, or
+  pushing down the inclusion of htmxlintrin.h to one
+  file with -mhtm enabled and removing the inline
+  optimization.
+
+  Per FIXME in s390x's htmxlintrin.h, the __TM_simple_begin
+  isn't always_inline resulting in duplicate definitions if
+  it were included more than once.  While xabort and xend
+  could be implemented here, we keep the implementation the
+  same as ppc64.
+ */
+TRANSACTIONAL_TARGET bool xbegin();
+TRANSACTIONAL_TARGET void xabort();
+TRANSACTIONAL_TARGET void xend();
+#  ifdef UNIV_DEBUG
+bool xtest();
+#  endif
+
+# endif
+#endif
+/** Scoped exclusive lock that first tries to elide m.lock() via xbegin() */
+template<class mutex>
+class transactional_lock_guard
+{
+  mutex &m;
+
+public:
+  TRANSACTIONAL_INLINE transactional_lock_guard(mutex &m) : m(m)
+  {
+#ifndef NO_ELISION
+    if (xbegin())
+    {
+      if (was_elided()) /* m is idle: proceed without m.lock() */
+        return;
+      xabort(); /* m is busy: give up the memory transaction */
+    }
+#endif
+    m.lock();
+  }
+  transactional_lock_guard(const transactional_lock_guard &)= delete;
+  TRANSACTIONAL_INLINE ~transactional_lock_guard()
+  {
+#ifndef NO_ELISION
+    if (was_elided()) xend(); else
+#endif
+    m.unlock();
+  }
+  /** @return whether m is not locked or waited for (false if NO_ELISION) */
+#ifndef NO_ELISION
+  bool was_elided() const noexcept { return !m.is_locked_or_waiting(); }
+#else
+  bool was_elided() const noexcept { return false; }
+#endif
+};
+/** Scoped shared lock that first tries to elide m.lock_shared() via xbegin() */
+template<class mutex>
+class transactional_shared_lock_guard
+{
+  mutex &m;
+#ifndef NO_ELISION
+  bool elided; /* assigned by the constructor on every path */
+#else
+  static constexpr bool elided= false;
+#endif
+
+public:
+  TRANSACTIONAL_INLINE transactional_shared_lock_guard(mutex &m) : m(m)
+  {
+#ifndef NO_ELISION
+    if (xbegin())
+    {
+      if (!m.is_write_locked()) /* not write-locked: skip m.lock_shared() */
+      {
+        elided= true;
+        return;
+      }
+      xabort(); /* m is write-locked: give up the memory transaction */
+    }
+    elided= false;
+#endif
+    m.lock_shared();
+  }
+  transactional_shared_lock_guard(const transactional_shared_lock_guard &)=
+    delete;
+  TRANSACTIONAL_INLINE ~transactional_shared_lock_guard()
+  {
+#ifndef NO_ELISION
+    if (was_elided()) xend(); else
+#endif
+    m.unlock_shared();
+  }
+  /** @return whether the constructor skipped m.lock_shared() */
+  bool was_elided() const noexcept { return elided; }
+};
diff --git a/storage/innobase/include/trx0i_s.h b/storage/innobase/include/trx0i_s.h
new file mode 100644
index 00000000..caacfa09
--- /dev/null
+++ b/storage/innobase/include/trx0i_s.h
@@ -0,0 +1,277 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0i_s.h
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables cache structures and public
+functions.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef trx0i_s_h
+#define trx0i_s_h
+
+#include "trx0types.h"
+#include "dict0types.h"
+#include "buf0types.h"
+
+/** The maximum amount of memory that can be consumed by innodb_trx,
+innodb_locks and innodb_lock_waits information schema tables. */
+#define TRX_I_S_MEM_LIMIT		16777216 /* 16 MiB */
+
+/** The maximum length of a string that can be stored in
+i_s_locks_row_t::lock_data */
+#define TRX_I_S_LOCK_DATA_MAX_LEN	8192
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_query */
+#define TRX_I_S_TRX_QUERY_MAX_LEN	1024
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_foreign_key_error */
+#define TRX_I_S_TRX_FK_ERROR_MAX_LEN	256
+
+/** Copy data, truncated to constraint chars, into an INNODB_TRX string column
+(field) via (tcache)->storage; data and constraint are evaluated repeatedly */
+#define TRX_I_S_STRING_COPY(data, field, constraint, tcache)	\
+do {								\
+	if (strlen(data) > constraint) {			\
+		char	buff[constraint + 1];			\
+		strncpy(buff, data, constraint);		\
+		buff[constraint] = '\0';			\
+								\
+		field = static_cast<const char*>(		\
+			ha_storage_put_memlim(			\
+			(tcache)->storage, buff, constraint + 1,\
+			MAX_ALLOWED_FOR_STORAGE(tcache)));	\
+	} else {						\
+		field = static_cast<const char*>(		\
+			ha_storage_put_str_memlim(		\
+			(tcache)->storage, data,		\
+			MAX_ALLOWED_FOR_STORAGE(tcache)));	\
+	}							\
+} while (0)
+
+/** A row of INFORMATION_SCHEMA.innodb_locks */
+struct i_s_locks_row_t;
+
+/** Objects of trx_i_s_cache_t::locks_hash */
+struct i_s_hash_chain_t;
+
+/** Objects of this type are added to the hash table
+trx_i_s_cache_t::locks_hash */
+struct i_s_hash_chain_t {
+	i_s_locks_row_t*	value;	/*!< row of
+					INFORMATION_SCHEMA.innodb_locks*/
+	i_s_hash_chain_t*	next;	/*!< next item in the hash chain */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_locks row */
+struct i_s_locks_row_t {
+	trx_id_t	lock_trx_id;	/*!< transaction identifier */
+	const char*	lock_table;	/*!< table name from
+					lock_get_table_name() */
+	/** index name of a record lock; NULL for table locks */
+	const char*	lock_index;
+	/** page identifier of the record; (0,0) if !lock_index */
+	page_id_t	lock_page;
+	/** heap number of the record; 0 if !lock_index */
+	uint16_t	lock_rec;
+	/** lock mode corresponding to lock_mode_values_typelib */
+	uint8_t		lock_mode;
+	/** (some) content of the record, if available in the buffer pool;
+	NULL if !lock_index */
+	const char*	lock_data;
+
+	/** The following are auxiliary and not included in the table */
+	/* @{ */
+	table_id_t	lock_table_id;
+					/*!< table identifier from
+					lock_get_table_id */
+	i_s_hash_chain_t hash_chain;	/*!< hash table chain node for
+					trx_i_s_cache_t::locks_hash */
+	/* @} */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_trx row */
+struct i_s_trx_row_t {
+	trx_id_t		trx_id;		/*!< transaction identifier */
+	const char*		trx_state;
+	time_t			trx_started;	/*!< trx_t::start_time */
+	const i_s_locks_row_t*	requested_lock_row;
+					/*!< pointer to a row
+					in innodb_locks if trx
+					is waiting, or NULL */
+	time_t		trx_wait_started; /*!< trx_t->lock.wait_started */
+	uintmax_t	trx_weight;	/*!< TRX_WEIGHT() */
+	ulint		trx_mysql_thread_id; /*!< thd_get_thread_id() */
+	const char*	trx_query;	/*!< MySQL statement being
+					executed in the transaction */
+	CHARSET_INFO*	trx_query_cs;	/*!< the charset of trx_query */
+	const char*	trx_operation_state; /*!< trx_t::op_info */
+	ulint		trx_tables_in_use;/*!< n_mysql_tables_in_use in
+					 trx_t */
+	ulint		trx_tables_locked;
+					/*!< mysql_n_tables_locked in
+					trx_t */
+	ulint		trx_lock_structs;/*!< list len of trx_locks in
+					trx_t */
+	ulint		trx_lock_memory_bytes;
+					/*!< mem_heap_get_size(
+					trx->lock_heap) */
+	ulint		trx_rows_locked;/*!< trx_lock_t::n_rec_locks */
+	uintmax_t	trx_rows_modified;/*!< trx_t::undo_no */
+	uint		trx_isolation_level;
+					/*!< trx_t::isolation_level */
+	bool		trx_unique_checks;
+					/*!< check_unique_secondary in trx_t*/
+	bool		trx_foreign_key_checks;
+					/*!< check_foreigns in trx_t */
+	const char*	trx_foreign_key_error;
+					/*!< detailed_error in trx_t */
+	bool		trx_is_read_only;
+					/*!< trx_t::read_only */
+	bool		trx_is_autocommit_non_locking;
+					/*!< trx_t::is_autocommit_non_locking()
+					*/
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_lock_waits row */
+struct i_s_lock_waits_row_t {
+	const i_s_locks_row_t*	requested_lock_row;	/*!< requested lock */
+	const i_s_locks_row_t*	blocking_lock_row;	/*!< blocking lock */
+};
+
+/** Cache of INFORMATION_SCHEMA table data */
+struct trx_i_s_cache_t;
+
+/** Auxiliary enum used by functions that need to select one of the
+INFORMATION_SCHEMA tables */
+enum i_s_table {
+	I_S_INNODB_TRX,		/*!< INFORMATION_SCHEMA.innodb_trx */
+	I_S_INNODB_LOCKS,	/*!< INFORMATION_SCHEMA.innodb_locks */
+	I_S_INNODB_LOCK_WAITS	/*!< INFORMATION_SCHEMA.innodb_lock_waits */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+extern trx_i_s_cache_t*	trx_i_s_cache;
+
+/*******************************************************************//**
+Initialize INFORMATION SCHEMA trx related cache. */
+void
+trx_i_s_cache_init(
+/*===============*/
+	trx_i_s_cache_t*	cache);	/*!< out: cache to init */
+/*******************************************************************//**
+Free the INFORMATION SCHEMA trx related cache. */
+void
+trx_i_s_cache_free(
+/*===============*/
+	trx_i_s_cache_t*	cache);	/*!< in/out: cache to free */
+
+/*******************************************************************//**
+Issue a shared/read lock on the tables cache. */
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+/*******************************************************************//**
+Release a shared/read lock on the tables cache. */
+void
+trx_i_s_cache_end_read(
+/*===================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+/*******************************************************************//**
+Issue an exclusive/write lock on the tables cache. */
+void
+trx_i_s_cache_start_write(
+/*======================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+/*******************************************************************//**
+Release an exclusive/write lock on the tables cache. */
+void
+trx_i_s_cache_end_write(
+/*====================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+
+/*******************************************************************//**
+Retrieves the number of used rows in the cache for a given
+INFORMATION SCHEMA table.
+@return number of rows */
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+	trx_i_s_cache_t*	cache,	/*!< in: cache */
+	enum i_s_table		table);	/*!< in: which table */
+
+/*******************************************************************//**
+Retrieves the nth row in the cache for a given INFORMATION SCHEMA
+table.
+@return row */
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+	trx_i_s_cache_t*	cache,	/*!< in: cache */
+	enum i_s_table		table,	/*!< in: which table */
+	ulint			n);	/*!< in: row number */
+
+/*******************************************************************//**
+Update the transactions cache if it has not been read for some time.
+@return 0 - fetched, 1 - not */
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+	trx_i_s_cache_t*	cache);	/*!< in/out: cache */
+
+/*******************************************************************//**
+Returns true, if the data in the cache is truncated due to the memory
+limit posed by TRX_I_S_MEM_LIMIT.
+@return TRUE if truncated */
+bool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+/** The maximum length of a resulting lock_id in
+trx_i_s_create_lock_id(), not including the terminating NUL.
+":%lu:%lu:%lu" -> 63 chars */
+#define TRX_I_S_LOCK_ID_MAX_LEN	(TRX_ID_MAX_LEN + 63)
+
+/*******************************************************************//**
+Crafts a lock id string from a i_s_locks_row_t object. Returns its
+second argument. This function aborts if there is not enough space in
+lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you
+want to be 100% sure that it will not abort.
+@return resulting lock id */
+char*
+trx_i_s_create_lock_id(
+/*===================*/
+	const i_s_locks_row_t*	row,	/*!< in: innodb_locks row */
+	char*			lock_id,/*!< out: resulting lock_id */
+	ulint			lock_id_size);/*!< in: size of the lock id
+					buffer */
+
+#endif /* trx0i_s_h */
diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h
new file mode 100644
index 00000000..3ddd2e98
--- /dev/null
+++ b/storage/innobase/include/trx0purge.h
@@ -0,0 +1,427 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0purge.h
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "trx0sys.h"
+#include "que0types.h"
+#include "srw_lock.h"
+
+#include <queue>
+#include <unordered_map>
+
+/** Prepend the history list with an undo log.
+Remove the undo log segment from the rseg slot if it is too big for reuse.
+@param[in]	trx		transaction
+@param[in,out]	undo		undo log
+@param[in,out]	mtr		mini-transaction */
+void
+trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr);
+
+/**
+Remove unnecessary history data from rollback segments. NOTE that when this
+function is called, the caller (purge_coordinator_callback)
+must not have any latches on undo log pages!
+*/
+void trx_purge_truncate_history();
+
+/**
+Run a purge batch.
+@param n_tasks       number of purge tasks to submit to the queue
+@param history_size  trx_sys.history_size()
+@return number of undo log pages handled in the batch */
+ulint trx_purge(ulint n_tasks, ulint history_size);
+
+/** Rollback segments from a given transaction with trx-no
+scheduled for purge. */
+class TrxUndoRsegs {
+private:
+	typedef std::vector<trx_rseg_t*, ut_allocator<trx_rseg_t*> >
+		trx_rsegs_t;
+public:
+	typedef trx_rsegs_t::iterator iterator;
+	typedef trx_rsegs_t::const_iterator const_iterator;
+
+	TrxUndoRsegs() = default;
+
+	/** Construct from one rollback segment, copying its last_trx_no() */
+	TrxUndoRsegs(trx_rseg_t& rseg)
+		: trx_no(rseg.last_trx_no()), m_rsegs(1, &rseg) {}
+	/** Construct from an explicit trx_no and one rollback segment */
+	TrxUndoRsegs(trx_id_t trx_no, trx_rseg_t& rseg)
+		: trx_no(trx_no), m_rsegs(1, &rseg) {}
+
+	bool operator!=(const TrxUndoRsegs& other) const
+	{ return trx_no != other.trx_no; }
+	bool empty() const { return m_rsegs.empty(); }
+	void erase(iterator& it) { m_rsegs.erase(it); }
+	iterator begin() { return(m_rsegs.begin()); }
+	iterator end() { return(m_rsegs.end()); }
+	const_iterator begin() const { return m_rsegs.begin(); }
+	const_iterator end() const { return m_rsegs.end(); }
+
+	/** Compare two TrxUndoRsegs based on trx_no.
+	@param lhs first element to compare
+	@param rhs second element to compare
+	@return true if lhs.trx_no > rhs.trx_no else false.*/
+	bool operator()(const TrxUndoRsegs& lhs, const TrxUndoRsegs& rhs) const
+	{
+		return(lhs.trx_no > rhs.trx_no);
+	}
+
+	/** Copy of trx_rseg_t::last_trx_no() */
+	trx_id_t trx_no= 0;
+private:
+	/** Rollback segments of a transaction, scheduled for purge. */
+	trx_rsegs_t m_rsegs{};
+};
+
+typedef std::priority_queue<
+	TrxUndoRsegs,
+	std::vector<TrxUndoRsegs, ut_allocator<TrxUndoRsegs> >,
+	TrxUndoRsegs>	purge_pq_t;
+
+/** Chooses the rollback segment with the oldest committed transaction */
+struct TrxUndoRsegsIterator {
+	/** Constructor */
+	TrxUndoRsegsIterator();
+	/** Sets the next rseg to purge in purge_sys.
+	Executed in the purge coordinator thread.
+	@retval false when nothing is to be purged
+	@retval true  when purge_sys.rseg->latch was locked */
+	inline bool set_next();
+
+private:
+	// Disable copying
+	TrxUndoRsegsIterator(const TrxUndoRsegsIterator&) = delete;
+	TrxUndoRsegsIterator& operator=(const TrxUndoRsegsIterator&) = delete;
+
+	/** The current element to process */
+	TrxUndoRsegs			m_rsegs;
+	/** Track the current element in m_rsegs */
+	TrxUndoRsegs::const_iterator	m_iter;
+};
+
+/** The control structure used in the purge operation */
+class purge_sys_t
+{
+  friend TrxUndoRsegsIterator;
+public:
+  /** latch protecting view, m_enabled */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) mutable srw_spin_lock latch;
+private:
+  /** Read view at the start of a purge batch. Any encountered index records
+  that are older than view will be removed. */
+  ReadViewBase view;
+  /** whether the subsystem has been initialized */
+  bool m_initialized{false};
+  /** whether purge is enabled; protected by latch and std::atomic */
+  std::atomic<bool> m_enabled{false};
+public:
+  /** whether purge is active (may hold table handles) */
+  std::atomic<bool> m_active{false};
+private:
+  /** number of pending stop() calls without resume() */
+  Atomic_counter<uint32_t> m_paused;
+  /** number of stop_SYS() calls without resume_SYS() */
+  Atomic_counter<uint32_t> m_SYS_paused;
+  /** number of stop_FTS() calls without resume_FTS() */
+  Atomic_counter<uint32_t> m_FTS_paused;
+
+  /** latch protecting end_view */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock_low end_latch;
+  /** Read view at the end of a purge batch (copied from view). Any undo pages
+  containing records older than end_view may be freed. */
+  ReadViewBase end_view;
+
+  struct hasher
+  {
+    size_t operator()(const page_id_t &id) const { return size_t(id.raw()); }
+  };
+
+  using unordered_map =
+    std::unordered_map<const page_id_t, buf_block_t*, hasher,
+#if defined __GNUC__ && __GNUC__ == 4 && __GNUC_MINOR__ >= 8
+                       std::equal_to<page_id_t>
+                       /* GCC 4.8.5 would fail to find a matching allocator */
+#else
+                       std::equal_to<page_id_t>,
+                       ut_allocator<std::pair<const page_id_t, buf_block_t*>>
+#endif
+                       >;
+  /** map of buffer-fixed undo log pages processed during a purge batch */
+  unordered_map pages;
+public:
+  /** @return the number of processed undo pages */
+  size_t n_pages_handled() const { return pages.size(); }
+
+  /** Look up an undo log page.
+  @param id    undo page identifier
+  @return undo page
+  @retval nullptr in case the page is corrupted */
+  buf_block_t *get_page(page_id_t id);
+
+	que_t*		query;		/*!< The query graph which will do the
+					parallelized purge operation */
+
+	/** Iterator to the undo log records of committed transactions */
+	struct iterator
+	{
+		bool operator<=(const iterator& other) const
+		{
+			if (trx_no < other.trx_no) return true;
+			if (trx_no > other.trx_no) return false;
+			return undo_no <= other.undo_no;
+		}
+
+		/** Free the undo pages up to this. */
+		dberr_t free_history() const;
+
+		/** trx_t::no of the committed transaction */
+		trx_id_t	trx_no;
+		/** The record number within the committed transaction's undo
+		log, increasing, purged from 0 onwards */
+		undo_no_t	undo_no;
+	};
+
+	/** The tail of the purge queue; the last parsed undo log of a
+	committed transaction. */
+	iterator	tail;
+	/** The head of the purge queue; any older undo logs of committed
+	transactions may be discarded (history list truncation).
+	Protected by latch. */
+	iterator	head;
+	/*-----------------------------*/
+	bool		next_stored;	/*!< whether rseg holds the next record
+					to purge */
+	trx_rseg_t*	rseg;		/*!< Rollback segment for the next undo
+					record to purge */
+private:
+	uint32_t	page_no;	/*!< Page number for the next undo
+					record to purge, page number of the
+					log header, if dummy record */
+	uint32_t	hdr_page_no;	/*!< Header page of the undo log where
+					the next record to purge belongs */
+	uint16_t	offset;		/*!< Page offset for the next undo
+					record to purge, 0 if the dummy
+					record */
+	uint16_t	hdr_offset;	/*!< Header byte offset on the page */
+
+
+	TrxUndoRsegsIterator
+			rseg_iter;	/*!< Iterator to get the next rseg
+					to process */
+public:
+	purge_pq_t	purge_queue;	/*!< Binary min-heap, ordered on
+					TrxUndoRsegs::trx_no. It is protected
+					by the pq_mutex */
+	mysql_mutex_t	pq_mutex;	/*!< Mutex protecting purge_queue */
+
+	/** Undo tablespace file truncation (only accessed by the
+	srv_purge_coordinator_thread) */
+	struct {
+		/** The undo tablespace that is currently being truncated */
+		fil_space_t*	current;
+		/** The undo tablespace that was last truncated */
+		fil_space_t*	last;
+	} truncate;
+
+  /** Create the instance */
+  void create();
+
+  /** Close the purge system on shutdown */
+  void close();
+
+  /** @return whether purge is enabled */
+  bool enabled() const { return m_enabled.load(std::memory_order_relaxed); }
+  /** @return whether the purge coordinator is paused */
+  bool paused() const
+  { return m_paused != 0; }
+
+  /** Enable purge at startup. */
+  void coordinator_startup()
+  {
+    ut_ad(!enabled());
+    m_enabled.store(true, std::memory_order_relaxed);
+    wake_if_not_active();
+  }
+
+  /** Disable purge at shutdown */
+  void coordinator_shutdown()
+  {
+    ut_ad(enabled());
+    m_enabled.store(false, std::memory_order_relaxed);
+  }
+
+  /** @return whether the purge tasks are active */
+  static bool running();
+
+  /** Stop purge during FLUSH TABLES FOR EXPORT. */
+  void stop();
+  /** Resume purge at UNLOCK TABLES after FLUSH TABLES FOR EXPORT */
+  void resume();
+
+  /** Close and reopen all tables in case of a MDL conflict with DDL */
+  dict_table_t *close_and_reopen(table_id_t id, THD *thd, MDL_ticket **mdl);
+private:
+  /** Suspend purge during a DDL operation on FULLTEXT INDEX tables */
+  void wait_FTS(bool also_sys);
+public:
+  /** Suspend purge in data dictionary tables */
+  void stop_SYS() { m_SYS_paused++; }
+  /** Resume purge in data dictionary tables */
+  static void resume_SYS(void *);
+
+  /** Pause purge during a DDL operation that could drop FTS_ tables. */
+  void stop_FTS();
+  /** Resume purge after stop_FTS(). */
+  void resume_FTS() { ut_d(const auto p=) m_FTS_paused--; ut_ad(p); }
+  /** @return whether stop_FTS() is in effect */
+  bool must_wait_FTS() const { return m_FTS_paused; }
+
+private:
+  /**
+  Get the next record to purge and update the info in the purge system.
+  @param roll_ptr           undo log pointer to the record
+  @return buffer-fixed reference to undo log record
+  @retval {nullptr,1} if the whole undo log can be skipped in purge
+  @retval {nullptr,0} if nothing is left, or on corruption */
+  inline trx_purge_rec_t get_next_rec(roll_ptr_t roll_ptr);
+
+  /** Choose the next undo log to purge.
+  @return whether anything is to be purged */
+  bool choose_next_log();
+
+  /** Update the last not yet purged history log info in rseg when
+  we have purged a whole undo log. Advances also purge_trx_no
+  past the purged log. */
+  void rseg_get_next_history_log();
+
+public:
+  /**
+  Fetch the next undo log record from the history list to purge.
+  @return buffer-fixed reference to undo log record
+  @retval {nullptr,1} if the whole undo log can be skipped in purge
+  @retval {nullptr,0} if nothing is left, or on corruption */
+  inline trx_purge_rec_t fetch_next_rec();
+
+  /** Determine if the history of a transaction is purgeable.
+  @param trx_id  transaction identifier
+  @return whether the history is purgeable */
+  TRANSACTIONAL_TARGET bool is_purgeable(trx_id_t trx_id) const;
+
+  /** A wrapper around ReadView::low_limit_no(). */
+  trx_id_t low_limit_no() const
+  {
+    /* This function may only be called by purge_coordinator_callback().
+
+    The purge coordinator task may call this without holding any latch,
+    because it is the only thread that may modify purge_sys.view.
+
+    Any other threads that access purge_sys.view must hold purge_sys.latch,
+    typically via purge_sys_t::view_guard. */
+    return view.low_limit_no();
+  }
+  /** A wrapper around ReadView::sees(). */
+  trx_id_t sees(trx_id_t id) const
+  {
+    /* This function may only be called by purge_coordinator_callback().
+
+    The purge coordinator task may call this without holding any latch,
+    because it is the only thread that may modify purge_sys.view.
+
+    Any other threads that access purge_sys.view must hold purge_sys.latch,
+    typically via purge_sys_t::view_guard. */
+    return view.sees(id); /* NOTE(review): is trx_id_t (not bool) intended? */
+  }
+  /** Update view via trx_sys.clone_oldest_view(); optionally end_view too. */
+  template<bool also_end_view= false>
+  void clone_oldest_view()
+  {
+    if (!also_end_view)
+      wait_FTS(true);
+    latch.wr_lock(SRW_LOCK_CALL);
+    trx_sys.clone_oldest_view(&view); /* refresh view under exclusive latch */
+    if (also_end_view) /* NOTE(review): end_latch is not acquired here */
+      (end_view= view).
+        clamp_low_limit_id(head.trx_no ? head.trx_no : tail.trx_no);
+    latch.wr_unlock();
+  }
+
+  /** Wake up the purge threads if there is work to do. */
+  void wake_if_not_active();
+
+  /** Release undo pages and update end_view at the end of a purge batch.
+  @param head  purge queue position; NOTE(review): presumably the new
+  value of purge_sys.head at the end of the batch -- confirm */
+  inline void batch_cleanup(const iterator &head);
+
+  struct view_guard
+  {
+    inline view_guard();
+    inline ~view_guard();
+
+    /** @return purge_sys.view */
+    inline const ReadViewBase &view() const;
+  };
+
+  struct end_view_guard
+  {
+    inline end_view_guard();
+    inline ~end_view_guard();
+
+    /** @return purge_sys.end_view */
+    inline const ReadViewBase &view() const;
+  };
+
+  /** Stop the purge thread and check n_ref_count of all auxiliary
+  and common tables associated with the FTS table.
+  @param	table		parent FTS table
+  @param	already_stopped	True indicates purge threads were
+				already stopped */
+  void stop_FTS(const dict_table_t &table, bool already_stopped=false);
+};
+
+/** The global data structure coordinating a purge */
+extern purge_sys_t	purge_sys;
+
+purge_sys_t::view_guard::view_guard()
+{ purge_sys.latch.rd_lock(SRW_LOCK_CALL); }
+
+purge_sys_t::view_guard::~view_guard()
+{ purge_sys.latch.rd_unlock(); }
+
+const ReadViewBase &purge_sys_t::view_guard::view() const
+{ return purge_sys.view; }
+
+purge_sys_t::end_view_guard::end_view_guard()
+{ purge_sys.end_latch.rd_lock(); }
+
+purge_sys_t::end_view_guard::~end_view_guard()
+{ purge_sys.end_latch.rd_unlock(); }
+
+const ReadViewBase &purge_sys_t::end_view_guard::view() const
+{ return purge_sys.end_view; }
diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h
new file mode 100644
index 00000000..3d9b1868
--- /dev/null
+++ b/storage/innobase/include/trx0rec.h
@@ -0,0 +1,299 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rec.h
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "trx0types.h"
+#include "row0types.h"
+#include "page0types.h"
+#include "que0types.h"
+
+/**********************************************************************//**
+Reads the undo log record number, stored much-compressed at undo_rec + 3.
+@return undo no */
+inline undo_no_t trx_undo_rec_get_undo_no(const trx_undo_rec_t *undo_rec)
+{
+  return mach_u64_read_much_compressed(undo_rec + 3);
+}
+
+/**********************************************************************//**
+Returns the start of the undo record data area. */
+#define trx_undo_rec_get_ptr(undo_rec, undo_no)		\
+	((undo_rec) + trx_undo_rec_get_offset(undo_no))
+
+/**********************************************************************//**
+Reads from an undo log record the general parameters.
+@return remaining part of undo log record after reading these values */
+const byte*
+trx_undo_rec_get_pars(
+/*==================*/
+	const trx_undo_rec_t*	undo_rec,	/*!< in: undo log record */
+	byte*		type,		/*!< out: undo record type:
+					TRX_UNDO_INSERT_REC, ... */
+	byte*		cmpl_info,	/*!< out: compiler info, relevant only
+					for update type records */
+	bool*		updated_extern,	/*!< out: true if we updated an
+					externally stored field */
+	undo_no_t*	undo_no,	/*!< out: undo log record number */
+	table_id_t*	table_id)	/*!< out: table id */
+	MY_ATTRIBUTE((nonnull));
+
+/*******************************************************************//**
+Builds a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+const byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+	const byte*	ptr,	/*!< in: remaining part of a copy of an undo log
+				record, at the start of the row reference;
+				NOTE that this copy of the undo log record must
+				be preserved as long as the row reference is
+				used, as we do NOT copy the data in the
+				record! */
+	dict_index_t*	index,	/*!< in: clustered index */
+	const dtuple_t**ref,	/*!< out, own: row reference */
+	mem_heap_t*	heap)	/*!< in: memory heap from which the memory
+				needed is allocated */
+	MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Reads from an undo log update record the system field values of the old
+version.
+@return remaining part of undo log record after reading these values */
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+	const byte*	ptr,		/*!< in: remaining part of undo
+					log record after reading
+					general parameters */
+	trx_id_t*	trx_id,		/*!< out: trx id */
+	roll_ptr_t*	roll_ptr,	/*!< out: roll ptr */
+	byte*		info_bits);	/*!< out: info bits state */
+/*******************************************************************//**
+Builds an update vector based on a remaining part of an undo log record.
+@return remaining part of the record, NULL if an error is detected, which
+means that the record is corrupted */
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+	const byte*	ptr,	/*!< in: remaining part in update undo log
+				record, after reading the row reference.
+				NOTE that this copy of the undo log record must
+				be preserved as long as the update vector is
+				used, as we do NOT copy the data in the
+				record! */
+	dict_index_t*	index,	/*!< in: clustered index */
+	ulint		type,	/*!< in: TRX_UNDO_UPD_EXIST_REC,
+				TRX_UNDO_UPD_DEL_REC, or
+				TRX_UNDO_DEL_MARK_REC; in the last case,
+				only trx id and roll ptr fields are added to
+				the update vector */
+	trx_id_t	trx_id,	/*!< in: transaction id from this undo record */
+	roll_ptr_t	roll_ptr,/*!< in: roll pointer from this undo record */
+	byte		info_bits,/*!< in: info bits from this undo record */
+	mem_heap_t*	heap,	/*!< in: memory heap from which the memory
+				needed is allocated */
+	upd_t**		upd);	/*!< out, own: update vector */
+/** Report a RENAME TABLE operation.
+@param[in,out]	trx	transaction
+@param[in]	table	table that is being renamed
+@return	DB_SUCCESS or error code */
+dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction.
+@return DB_SUCCESS or error code */
+dberr_t
+trx_undo_report_row_operation(
+/*==========================*/
+	que_thr_t*	thr,		/*!< in: query thread */
+	dict_index_t*	index,		/*!< in: clustered index */
+	const dtuple_t*	clust_entry,	/*!< in: in the case of an insert,
+					index entry to insert into the
+					clustered index; in updates,
+					may contain a clustered index
+					record tuple that also contains
+					virtual columns of the table;
+					otherwise, NULL */
+	const upd_t*	update,		/*!< in: in the case of an update,
+					the update vector, otherwise NULL */
+	ulint		cmpl_info,	/*!< in: compiler info on secondary
+					index updates */
+	const rec_t*	rec,		/*!< in: case of an update or delete
+					marking, the record in the clustered
+					index; NULL if insert */
+	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec) */
+	roll_ptr_t*	roll_ptr)	/*!< out: DB_ROLL_PTR to the
+					undo log record */
+	MY_ATTRIBUTE((nonnull(1,2), warn_unused_result));
+
+/** Status bits (v_status) used for trx_undo_prev_version_build() */
+
+/** TRX_UNDO_PREV_IN_PURGE tells trx_undo_prev_version_build() that it
+is being called by purge and that we would like to get the purge record
+even if it is in the purge view (in the normal case, it will return
+without fetching the purge record) */
+static constexpr ulint TRX_UNDO_PREV_IN_PURGE = 1;
+
+/** This tells trx_undo_prev_version_build() to fetch the old value in
+the undo log (which is the after image for an update) */
+static constexpr ulint TRX_UNDO_GET_OLD_V_VALUE = 2;
+
+/** indicate a call from row_vers_old_has_index_entry() */
+static constexpr ulint TRX_UNDO_CHECK_PURGEABILITY = 4;
+
+/** Build a previous version of a clustered index record. The caller
+must hold a latch on the index page of the clustered index record.
+@param	rec		version of a clustered index record
+@param	index		clustered index
+@param	offsets		rec_get_offsets(rec, index)
+@param	heap		memory heap from which the memory needed is
+			allocated
+@param	old_vers	previous version or NULL if rec is the
+			first inserted version, or if history data
+			has been deleted (an error), or if the purge
+			could have removed the version
+			though it has not yet done so
+@param	v_heap		memory heap used to create vrow
+			dtuple if it is not yet created. This heap
+                        differs from "heap" above in that it could be
+                        prebuilt->old_vers_heap for selection
+@param	vrow		virtual column info, if any
+@param	v_status	status flags that determine whether this
+			function is being called by the purge thread,
+			and whether we read the "after image" of undo log
+@return error code
+@retval DB_SUCCESS if previous version was successfully built,
+or if it was an insert or the undo record refers to the table before rebuild
+@retval DB_MISSING_HISTORY if the history is missing */
+dberr_t
+trx_undo_prev_version_build(
+	const rec_t 	*rec,
+	dict_index_t	*index,
+	rec_offs	*offsets,
+	mem_heap_t	*heap,
+	rec_t		**old_vers,
+	mem_heap_t	*v_heap,
+	dtuple_t	**vrow,
+	ulint		v_status);
+
+/** Read from an undo log record a non-virtual column value.
+@param ptr	pointer to remaining part of the undo record
+@param field	stored field
+@param len	length of the field, or UNIV_SQL_NULL
+@param orig_len	original length of the locally stored part
+of an externally stored column, or 0
+@return remaining part of undo log record after reading these values */
+const byte *trx_undo_rec_get_col_val(const byte *ptr, const byte **field,
+                                     uint32_t *len, uint32_t *orig_len);
+
+/** Read virtual column value from undo log
+@param[in]	table		the table
+@param[in]	ptr		undo log pointer
+@param[in,out]	row		the dtuple to fill
+@param[in]	in_purge	whether this is called by purge */
+void
+trx_undo_read_v_cols(
+	const dict_table_t*	table,
+	const byte*		ptr,
+	dtuple_t*		row,
+	bool			in_purge);
+
+/** Read virtual column index from undo log if the undo log contains such
+info, and verify the column is still indexed, and output its position
+@param[in]	table		the table
+@param[in]	ptr		undo log pointer
+@param[in]	first_v_col	if this is the first virtual column, which
+				has the version marker
+@param[in,out]	is_undo_log	this function is used to parse both undo log,
+				and online log for virtual columns. So
+				check to see if this is undo log
+@param[out]	field_no	the column number, or FIL_NULL if not indexed
+@return remaining part of undo log record after reading these values */
+const byte*
+trx_undo_read_v_idx(
+	const dict_table_t*	table,
+	const byte*		ptr,
+	bool			first_v_col,
+	bool*			is_undo_log,
+	uint32_t*		field_no);
+
+/* Types of an undo log record: these have to be smaller than 16, as the
+compilation info multiplied by 16 is ORed to this value in an undo log
+record */
+
+/** Undo log records for DDL operations
+
+Note: special rollback and purge triggers exist for SYS_INDEXES records:
+@see dict_drop_index_tree() */
+enum trx_undo_ddl_type
+{
+  /** RENAME TABLE (logging the old table name).
+
+  Because SYS_TABLES has PRIMARY KEY(NAME), the row-level undo log records
+  for SYS_TABLES cannot be distinguished from DROP TABLE, CREATE TABLE. */
+  TRX_UNDO_RENAME_TABLE= 9,
+  /** insert a metadata pseudo-record for instant ALTER TABLE */
+  TRX_UNDO_INSERT_METADATA= 10
+};
+
+/* DML operations */
+#define	TRX_UNDO_INSERT_REC	11	/* fresh insert into clustered index */
+#define	TRX_UNDO_UPD_EXIST_REC	12	/* update of a non-delete-marked
+					record */
+#define	TRX_UNDO_UPD_DEL_REC	13	/* update of a delete marked record to
+					a not delete marked record; also the
+					fields of the record can change */
+#define	TRX_UNDO_DEL_MARK_REC	14	/* delete marking of a record; fields
+					do not change */
+/** Bulk insert operation. It is written only when the table is
+under exclusive lock and the clustered index root page latch is being held,
+and the clustered index is empty. Rollback will empty the table and
+free the leaf segment of all indexes, re-create the new
+leaf segment and re-initialize the root page alone. */
+#define	TRX_UNDO_EMPTY		15
+
+#define	TRX_UNDO_CMPL_INFO_MULT	16U	/* compilation info is multiplied by
+					this and ORed to the type above */
+#define	TRX_UNDO_UPD_EXTERN	128U	/* This bit can be ORed to type_cmpl
+					to denote that we updated external
+					storage fields: used by purge to
+					free the external storage */
+
+/** The search tuple corresponding to TRX_UNDO_INSERT_METADATA */
+extern const dtuple_t trx_undo_metadata;
+
+/** Read the table id from an undo log record.
+@param[in]      rec        Undo log record
+@return table id stored as a part of undo log record */
+inline table_id_t trx_undo_rec_get_table_id(const trx_undo_rec_t *rec)
+{
+  rec+= 3; /* same offset where trx_undo_rec_get_undo_no() reads undo_no */
+  mach_read_next_much_compressed(&rec); /* value discarded; presumably advances rec */
+  return mach_read_next_much_compressed(&rec);
+}
diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h
new file mode 100644
index 00000000..9ef9ebe9
--- /dev/null
+++ b/storage/innobase/include/trx0roll.h
@@ -0,0 +1,168 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0roll.h
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0roll_h
+#define trx0roll_h
+
+#include "trx0trx.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+
+extern bool		trx_rollback_is_active;
+extern const trx_t*	trx_roll_crash_recv_trx;
+
+/** Report progress when rolling back a row of a recovered transaction. */
+void trx_roll_report_progress();
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery.  If the transaction already was
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back.
+@param all true=roll back all recovered active transactions;
+false=roll back any incomplete dictionary transaction */
+void
+trx_rollback_recovered(bool all);
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery.  If the transaction already was
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back.
+Note: this is done in a background thread. */
+void trx_rollback_all_recovered(void*);
+/*********************************************************************//**
+Creates a rollback command node struct.
+@return own: rollback node struct */
+roll_node_t*
+roll_node_create(
+/*=============*/
+	mem_heap_t*	heap);	/*!< in: mem heap where created */
+/***********************************************************//**
+Performs an execution step for a rollback command node in a query graph.
+@return query thread to run next, or NULL */
+que_thr_t*
+trx_rollback_step(
+/*==============*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return error code or DB_SUCCESS */
+dberr_t
+trx_rollback_for_mysql(
+/*===================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+	MY_ATTRIBUTE((nonnull));
+/*******************************************************************//**
+Rollback the latest SQL statement for MySQL.
+@return error code or DB_SUCCESS */
+dberr_t
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+	MY_ATTRIBUTE((nonnull));
+/*******************************************************************//**
+Rolls a transaction back to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+dberr_t
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const char*	savepoint_name,		/*!< in: savepoint name */
+	int64_t*	mysql_binlog_cache_pos)	/*!< out: the MySQL binlog cache
+						position corresponding to this
+						savepoint; MySQL needs this
+						information to remove the
+						binlog entries of the queries
+						executed after the savepoint */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*******************************************************************//**
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new. Savepoints are deleted in a transaction
+commit or rollback.
+@return always DB_SUCCESS */
+dberr_t
+trx_savepoint_for_mysql(
+/*====================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const char*	savepoint_name,		/*!< in: savepoint name */
+	int64_t		binlog_cache_pos)	/*!< in: MySQL binlog cache
+						position corresponding to this
+						connection at the time of the
+						savepoint */
+	MY_ATTRIBUTE((nonnull));
+/*******************************************************************//**
+Releases a named savepoint. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+dberr_t
+trx_release_savepoint_for_mysql(
+/*============================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const char*	savepoint_name)		/*!< in: savepoint name */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Rollback node states */
+enum roll_node_state {
+	ROLL_NODE_NONE = 0,		/*!< Unknown state */
+	ROLL_NODE_SEND,			/*!< about to send a rollback signal to
+					the transaction */
+	ROLL_NODE_WAIT			/*!< rollback signal sent to the
+					transaction, waiting for completion */
+};
+
+/** Rollback command node in a query graph */
+struct roll_node_t{
+	que_common_t		common;	/*!< node type: QUE_NODE_ROLLBACK */
+	enum roll_node_state	state;	/*!< node execution state */
+	const trx_savept_t*	savept;	/*!< savepoint to which to
+					roll back, in the case of a
+					partial rollback */
+	que_thr_t*		undo_thr;/*!< undo query graph */
+};
+
+/** A savepoint set with SQL's "SAVEPOINT savepoint_id" command */
+struct trx_named_savept_t{
+	char*		name;		/*!< savepoint name */
+	trx_savept_t	savept;		/*!< the undo number corresponding to
+					the savepoint */
+	int64_t		mysql_binlog_cache_pos;
+					/*!< the MySQL binlog cache position
+					corresponding to this savepoint, not
+					defined if the MySQL binlogging is not
+					enabled */
+	UT_LIST_NODE_T(trx_named_savept_t)
+			trx_savepoints;	/*!< the list of savepoints of a
+					transaction */
+};
+
+#endif
diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h
new file mode 100644
index 00000000..43e0c290
--- /dev/null
+++ b/storage/innobase/include/trx0rseg.h
@@ -0,0 +1,301 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rseg.h
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "trx0types.h"
+#include "fut0lst.h"
+
+/** Create a rollback segment header.
+@param[in,out]  space           system, undo, or temporary tablespace
+@param[in]      rseg_id         rollback segment identifier
+@param[in]      max_trx_id      new value of TRX_RSEG_MAX_TRX_ID
+@param[in,out]  mtr             mini-transaction
+@param[out]     err             error code
+@return the block of the created rollback segment header (nonnull arguments)
+@retval nullptr on failure */
+buf_block_t *trx_rseg_header_create(fil_space_t *space, ulint rseg_id,
+                                    trx_id_t max_trx_id, mtr_t *mtr,
+                                    dberr_t *err)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Initialize or recover the rollback segments at startup. @return error code */
+dberr_t trx_rseg_array_init();
+
+/** Create the temporary rollback segments. @param mtr mini-transaction */
+dberr_t trx_temp_rseg_create(mtr_t *mtr);
+
+/* Number of undo log slots in a rollback segment file copy (page size / 16) */
+#define TRX_RSEG_N_SLOTS	(srv_page_size / 16)
+
+/* Maximum number of transactions supported by a single rollback segment */
+#define TRX_RSEG_MAX_N_TRXS	(TRX_RSEG_N_SLOTS / 2)
+
+/** The rollback segment memory object; aligned to CPU_LEVEL1_DCACHE_LINESIZE */
+struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) trx_rseg_t
+{
+  /** tablespace containing the rollback segment; constant after init() */
+  fil_space_t *space;
+  /** latch protecting everything except page_no, space */
+  srw_spin_lock latch;
+  /** rollback segment header page number; constant after init() */
+  uint32_t page_no;
+  /** length of the TRX_RSEG_HISTORY list (number of transactions) */
+  uint32_t history_size;
+
+  /** Last known transaction that has not been purged yet,
+  or 0 if everything has been purged. */
+  trx_id_t needs_purge;
+
+private:
+  /** Reference counter to track is_persistent() transactions, counted in
+  units of REF, combined with the SKIP flag in the least significant bit. */
+  std::atomic<uint32_t> ref;
+
+  /** Flag in ref: whether undo tablespace truncation is pending */
+  static constexpr uint32_t SKIP= 1;
+  /** Transaction reference count multiplier (each reference adds REF to ref) */
+  static constexpr uint32_t REF= 2;
+  /** @return the current value of ref, loaded with relaxed memory order */
+  uint32_t ref_load() const { return ref.load(std::memory_order_relaxed); }
+
+  /** Atomically set the SKIP bit (bit 0 of ref) */
+  void ref_set_skip()
+  {
+    static_assert(SKIP == 1U, "compatibility"); /* bit 0 is hard-coded below */
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    __asm__ __volatile__("lock btsl $0, %0" : "+m" (ref));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    _interlockedbittestandset(reinterpret_cast<volatile long*>(&ref), 0);
+#else
+    ref.fetch_or(SKIP, std::memory_order_relaxed);
+#endif
+  }
+  /** Atomically clear the SKIP bit (bit 0 of ref) */
+  void ref_reset_skip()
+  {
+    static_assert(SKIP == 1U, "compatibility"); /* bit 0 is hard-coded below */
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    __asm__ __volatile__("lock btrl $0, %0" : "+m" (ref));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    _interlockedbittestandreset(reinterpret_cast<volatile long*>(&ref), 0);
+#else
+    ref.fetch_and(~SKIP, std::memory_order_relaxed);
+#endif
+  }
+
+public:
+
+  /** Initialize the fields that are not zero-initialized. */
+  void init(fil_space_t *space, uint32_t page);
+  /** Reinitialize the fields on undo tablespace truncation. */
+  void reinit(uint32_t page);
+  /** Clean up. */
+  void destroy();
+
+  /** Note that undo tablespace truncation was started (sets SKIP). */
+  void set_skip_allocation() { ut_ad(is_persistent()); ref_set_skip(); }
+  /** Note that undo tablespace truncation was completed (clears SKIP). */
+  void clear_skip_allocation()
+  {
+    ut_ad(is_persistent());
+#if defined DBUG_OFF
+    ref_reset_skip();
+#else /* use fetch_and() so that the previous value can be checked */
+    ut_d(auto r=) ref.fetch_and(~SKIP, std::memory_order_relaxed);
+    ut_ad(r == SKIP); /* SKIP was set and no references were held */
+#endif
+  }
+  /** @return whether the segment is marked for undo truncation */
+  bool skip_allocation() const
+  { return ref.load(std::memory_order_acquire) & SKIP; }
+  /** Increment the reference count; SKIP is asserted not to be set */
+  void acquire()
+  { ut_d(auto r=) ref.fetch_add(REF); ut_ad(!(r & SKIP)); }
+  /** Increment the reference count if possible
+  @retval true  if the reference count was incremented
+  @retval false if skip_allocation() holds */
+  bool acquire_if_available()
+  {
+    uint32_t r= 0; /* first guess: no references and SKIP not set */
+    while (!ref.compare_exchange_weak(r, r + REF,
+                                      std::memory_order_relaxed,
+                                      std::memory_order_relaxed))
+      if (r & SKIP) /* a failed exchange stored the observed value in r */
+        return false;
+    return true;
+  }
+
+  /** Decrement the reference count; at least one reference must be held */
+  void release()
+  {
+    ut_d(const auto r=)
+    ref.fetch_sub(REF, std::memory_order_relaxed);
+    ut_ad(r >= REF);
+  }
+  /** @return whether references exist (the SKIP flag alone does not count) */
+  bool is_referenced() const { return ref_load() >= REF; }
+
+  /** current size in pages */
+  uint32_t curr_size;
+
+  /** List of undo logs (transactions) */
+  UT_LIST_BASE_NODE_T(trx_undo_t) undo_list;
+  /** List of undo log segments cached for fast reuse */
+  UT_LIST_BASE_NODE_T(trx_undo_t) undo_cached;
+
+  /** Last not yet purged undo log header; FIL_NULL if all purged */
+  uint32_t last_page_no;
+
+  /** trx_t::no in the low 48 bits | last_offset << 48 in the high 16 bits */
+  uint64_t last_commit_and_offset;
+
+  /** @return the commit ID of the last committed transaction */
+  trx_id_t last_trx_no() const
+  { return last_commit_and_offset & ((1ULL << 48) - 1); }
+  /** @return header offset of the last committed transaction */
+  uint16_t last_offset() const
+  { return static_cast<uint16_t>(last_commit_and_offset >> 48); }
+  /** Set last_commit_and_offset; trx_no is expected to fit in 48 bits */
+  void set_last_commit(uint16_t last_offset, trx_id_t trx_no)
+  {
+    last_commit_and_offset= static_cast<uint64_t>(last_offset) << 48 | trx_no;
+  }
+
+  /** @return the page identifier (space->id, page_no) of the header page */
+  page_id_t page_id() const { return page_id_t{space->id, page_no}; }
+
+  /** @return the rollback segment header page, exclusively latched */
+  buf_block_t *get(mtr_t *mtr, dberr_t *err) const;
+
+  /** @return whether the rollback segment is persistent,
+  that is, space->id is not SRV_TMP_SPACE_ID */
+  bool is_persistent() const
+  {
+    ut_ad(space == fil_system.temp_space || space == fil_system.sys_space ||
+          (srv_undo_space_id_start > 0 &&
+           space->id >= srv_undo_space_id_start &&
+           space->id <= srv_undo_space_id_start + TRX_SYS_MAX_UNDO_SPACES));
+    ut_ad(space == fil_system.temp_space || space == fil_system.sys_space ||
+          !srv_was_started ||
+          (srv_undo_space_id_start > 0 &&
+           space->id >= srv_undo_space_id_start
+           && space->id <= srv_undo_space_id_start +
+           srv_undo_tablespaces_open));
+    return space->id != SRV_TMP_SPACE_ID;
+  }
+};
+
+/* Undo log segment slot in a rollback segment header */
+/*-------------------------------------------------------------*/
+#define	TRX_RSEG_SLOT_PAGE_NO	0	/* Page number of the header page of
+					an undo log segment */
+/*-------------------------------------------------------------*/
+/* Slot size in bytes */
+#define TRX_RSEG_SLOT_SIZE	4
+
+/* The offset of the rollback segment header on its page */
+#define	TRX_RSEG		FSEG_PAGE_DATA
+
+/* Transaction rollback segment header; offsets are relative to TRX_RSEG */
+/*-------------------------------------------------------------*/
+/** 0xfffffffe = pre-MariaDB 10.3.5 format; 0=MariaDB 10.3.5 or later */
+#define	TRX_RSEG_FORMAT		0
+/** Number of pages in the TRX_RSEG_HISTORY list */
+#define	TRX_RSEG_HISTORY_SIZE	4
+/** Committed transaction logs that have not been purged yet */
+#define	TRX_RSEG_HISTORY	8
+#define	TRX_RSEG_FSEG_HEADER	(8 + FLST_BASE_NODE_SIZE)
+					/* Header for the file segment where
+					this page is placed */
+#define TRX_RSEG_UNDO_SLOTS	(8 + FLST_BASE_NODE_SIZE + FSEG_HEADER_SIZE)
+					/* Undo log segment slots */
+/** Maximum transaction ID (valid only if TRX_RSEG_FORMAT is 0);
+located right after the TRX_RSEG_N_SLOTS undo log segment slots */
+#define TRX_RSEG_MAX_TRX_ID	(TRX_RSEG_UNDO_SLOTS + TRX_RSEG_N_SLOTS	\
+				 * TRX_RSEG_SLOT_SIZE)
+
+/** 8-byte offset within the binlog file (the expansion is parenthesized) */
+#define TRX_RSEG_BINLOG_OFFSET		(TRX_RSEG_MAX_TRX_ID + 8)
+/** MySQL log file name, 512 bytes, including terminating NUL
+(valid only if TRX_RSEG_FORMAT is 0).
+If no binlog information is present, the first byte is NUL. */
+#define TRX_RSEG_BINLOG_NAME		(TRX_RSEG_MAX_TRX_ID + 16)
+/** Maximum length of binlog file name, including terminating NUL, in bytes */
+#define TRX_RSEG_BINLOG_NAME_LEN	512
+
+#ifdef WITH_WSREP
+# include "trx0xa.h"
+
+/** Update the WSREP XID information in rollback segment header.
+@param[in,out]	rseg_header	rollback segment header
+@param[in]	xid		WSREP XID
+@param[in,out]	mtr		mini-transaction */
+void
+trx_rseg_update_wsrep_checkpoint(
+	buf_block_t*	rseg_header,
+	const XID*	xid,
+	mtr_t*		mtr);
+
+/** Update the WSREP checkpoint XID in the first rollback segment header
+as part of wsrep_set_SE_checkpoint() when it is guaranteed that there
+are no wsrep transactions committing.
+If the UUID part of the WSREP XID does not match the UUIDs of XIDs already
+stored into rollback segments, the WSREP XID in all the remaining rollback
+segments will be reset.
+@param[in]	xid		WSREP XID */
+void trx_rseg_update_wsrep_checkpoint(const XID* xid);
+
+/** Recover the latest WSREP checkpoint XID.
+@param[out]	xid	WSREP XID
+@return	whether the WSREP XID was found */
+bool trx_rseg_read_wsrep_checkpoint(XID& xid);
+#endif /* WITH_WSREP */
+
+/** Look up the page number stored in one undo log slot.
+@param[in]      rseg_header     rollback segment header page
+@param[in]      n               index of the slot to read */
+inline uint32_t trx_rsegf_get_nth_undo(const buf_block_t *rseg_header, ulint n)
+{
+  ut_ad(n < TRX_RSEG_N_SLOTS);
+  const ulint offset= TRX_RSEG + TRX_RSEG_UNDO_SLOTS + n * TRX_RSEG_SLOT_SIZE;
+  return mach_read_from_4(rseg_header->page.frame + offset);
+}
+
+/** Upgrade a rollback segment header page to MariaDB 10.3 format.
+@param[in,out]	rseg_header	rollback segment header page
+@param[in,out]	mtr		mini-transaction */
+void trx_rseg_format_upgrade(buf_block_t *rseg_header, mtr_t *mtr);
+
+/** Update the offset information about the end of the binlog entry
+which corresponds to the transaction just being committed.
+In a replication slave, this updates the master binlog position
+up to which replication has proceeded.
+@param[in,out]	rseg_header	rollback segment header
+@param[in]	log_file_name	binlog file name
+@param[in]	log_offset	binlog offset value
+@param[in,out]	mtr		mini-transaction */
+void trx_rseg_update_binlog_offset(buf_block_t *rseg_header,
+                                   const char *log_file_name,
+                                   ulonglong log_offset,
+                                   mtr_t *mtr);
diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h
new file mode 100644
index 00000000..5dd0169f
--- /dev/null
+++ b/storage/innobase/include/trx0sys.h
@@ -0,0 +1,1274 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0sys.h
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "buf0buf.h"
+#include "fil0fil.h"
+#include "trx0rseg.h"
+#include "mem0mem.h"
+#include "mtr0mtr.h"
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "read0types.h"
+#include "page0types.h"
+#include "trx0trx.h"
+#include "ilist.h"
+#include "my_cpu.h"
+
+#ifdef UNIV_PFS_MUTEX
+extern mysql_pfs_key_t trx_sys_mutex_key;
+#endif
+
+/** Tell whether a page identifier denotes the TRX_SYS header page.
+@param[in]	page_id	page identifier to test
+@return whether page_id equals (TRX_SYS_SPACE, TRX_SYS_PAGE_NO) */
+inline bool trx_sys_hdr_page(const page_id_t page_id)
+{
+  return page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO) == page_id;
+}
+
+/** Creates and initializes the transaction system at the database creation.
+@param mtr mini-transaction  @return error code */
+dberr_t trx_sys_create_sys_pages(mtr_t *mtr);
+
+/** Find an available rollback segment slot.
+@param[in]	sys_header	TRX_SYS page
+@return an unallocated rollback segment slot in the TRX_SYS header
+@retval ULINT_UNDEFINED if not found */
+ulint
+trx_sys_rseg_find_free(const buf_block_t* sys_header);
+/** Fetch the TRX_SYS header page through buf_page_get().
+@param[in,out]	mtr	mini-transaction handed to buf_page_get()
+@param[in]	rw	true for RW_X_LATCH (write), false for RW_S_LATCH
+@return the TRX_SYS page, or NULL if the page cannot be read */
+inline buf_block_t *trx_sysf_get(mtr_t* mtr, bool rw= true)
+{
+  const page_id_t id(TRX_SYS_SPACE, TRX_SYS_PAGE_NO);
+  return buf_page_get(id, 0, rw ? RW_X_LATCH : RW_S_LATCH, mtr);
+}
+
+#ifdef UNIV_DEBUG
+/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
+extern uint			trx_rseg_n_slots_debug;
+#endif
+
+/** Store a transaction identifier into a DB_TRX_ID field.
+@param[out]	db_trx_id	destination: the 6-byte DB_TRX_ID field
+@param[in]	id		transaction ID to store */
+UNIV_INLINE
+void
+trx_write_trx_id(byte* db_trx_id, trx_id_t id)
+{
+	static_assert(DATA_TRX_ID_LEN == 6, "DB_TRX_ID must be 6 bytes");
+	mach_write_to_6(db_trx_id, id);
+}
+
+/** Load a transaction identifier from a 6-byte field.
+@return the transaction identifier stored at ptr */
+inline
+trx_id_t
+trx_read_trx_id(const byte* ptr)
+{
+	static_assert(DATA_TRX_ID_LEN == 6, "DB_TRX_ID must be 6 bytes");
+	return mach_read_from_6(ptr);
+}
+
+#ifdef UNIV_DEBUG
+/** Assert that a stored DB_TRX_ID is 0 or greater than trx_id; returns true.
+@param[in]	db_trx_id	the DB_TRX_ID column to validate
+@param[in]	trx_id		the id of the ALTER TABLE transaction */
+inline bool trx_id_check(const void* db_trx_id, trx_id_t trx_id)
+{
+	const trx_id_t id = trx_read_trx_id(static_cast<const byte*>(db_trx_id));
+	ut_ad(id == 0 || id > trx_id);
+	return true;
+}
+#endif
+
+/*****************************************************************//**
+Updates the offset information about the end of the MySQL binlog entry
+which corresponds to the transaction just being committed. In a MySQL
+replication slave, this updates the latest master binlog position up to
+which replication has proceeded. */
+void
+trx_sys_update_mysql_binlog_offset(
+/*===============================*/
+	const char*	file_name,/*!< in: MySQL log file name */
+	int64_t		offset,	/*!< in: position in that log file */
+	buf_block_t*	sys_header, /*!< in,out: trx sys header */
+	mtr_t*		mtr);	/*!< in,out: mini-transaction */
+/** Display the MySQL binlog offset info if it is present in the trx
+system header. */
+void
+trx_sys_print_mysql_binlog_offset();
+
+/** Create the rollback segments.
+@return	whether the creation succeeded */
+bool
+trx_sys_create_rsegs();
+
+/** The offset of the transaction system header on the page */
+#define	TRX_SYS		FSEG_PAGE_DATA
+
+/** Transaction system header; field offsets are relative to TRX_SYS */
+/*------------------------------------------------------------- @{ */
+/** In old versions of InnoDB, this persisted the value of
+trx_sys.get_max_trx_id(). Starting with MariaDB 10.3.5,
+the field TRX_RSEG_MAX_TRX_ID in rollback segment header pages
+and the fields TRX_UNDO_TRX_ID, TRX_UNDO_TRX_NO in undo log pages
+are used instead. The field only exists for the purpose of upgrading
+from older MySQL or MariaDB versions. */
+#define	TRX_SYS_TRX_ID_STORE	0
+#define TRX_SYS_FSEG_HEADER	8	/*!< segment header for the
+					tablespace segment the trx
+					system is created into */
+#define	TRX_SYS_RSEGS		(8 + FSEG_HEADER_SIZE)
+					/*!< the start of the array of
+					rollback segment specification
+					slots */
+
+/* Rollback segment specification slot offsets (relative to the slot start) */
+
+/** the tablespace ID of an undo log header; starting with
+MySQL/InnoDB 5.1.7, this is FIL_NULL if the slot is unused */
+#define	TRX_SYS_RSEG_SPACE	0
+/** the page number of an undo log header, or FIL_NULL if unused */
+#define	TRX_SYS_RSEG_PAGE_NO	4
+/** Size of a rollback segment specification slot, in bytes */
+#define TRX_SYS_RSEG_SLOT_SIZE	8
+
+/** Fetch the tablespace identifier recorded in a TRX_SYS rollback segment slot.
+@param[in]	sys_header	TRX_SYS page
+@param[in]	rseg_id		slot number; asserted to be below TRX_SYS_N_RSEGS
+@return	the 4-byte TRX_SYS_RSEG_SPACE field of the slot */
+inline
+uint32_t
+trx_sysf_rseg_get_space(const buf_block_t* sys_header, ulint rseg_id)
+{
+	ut_ad(rseg_id < TRX_SYS_N_RSEGS);
+	const ulint offset = TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE
+		+ rseg_id * TRX_SYS_RSEG_SLOT_SIZE;
+	return mach_read_from_4(sys_header->page.frame + offset);
+}
+
+/** Fetch the header page number recorded in a TRX_SYS rollback segment slot.
+@param[in]	sys_header	TRX_SYS page
+@param[in]	rseg_id		slot number; asserted to be below TRX_SYS_N_RSEGS
+@return	the 4-byte TRX_SYS_RSEG_PAGE_NO field of the slot */
+inline uint32_t
+trx_sysf_rseg_get_page_no(const buf_block_t *sys_header, ulint rseg_id)
+{
+  ut_ad(rseg_id < TRX_SYS_N_RSEGS);
+  const ulint offset= TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO +
+    rseg_id * TRX_SYS_RSEG_SLOT_SIZE;
+  return mach_read_from_4(sys_header->page.frame + offset);
+}
+
+/** Maximum length of MySQL binlog file name, in bytes.
+(Used before MariaDB 10.3.5.) */
+#define TRX_SYS_MYSQL_LOG_NAME_LEN	512
+/** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */
+#define TRX_SYS_MYSQL_LOG_MAGIC_N	873422344
+
+#if UNIV_PAGE_SIZE_MIN < 4096
+# error "UNIV_PAGE_SIZE_MIN < 4096"
+#endif
+/** The offset of the MySQL binlog offset info in the trx system header
+page, counted from the page start (1000 bytes before the page end) */
+#define TRX_SYS_MYSQL_LOG_INFO		(srv_page_size - 1000)
+#define	TRX_SYS_MYSQL_LOG_MAGIC_N_FLD	0	/*!< magic number which is
+						TRX_SYS_MYSQL_LOG_MAGIC_N
+						if we have valid data in the
+						MySQL binlog info */
+#define TRX_SYS_MYSQL_LOG_OFFSET	4	/*!< the 64-bit offset
+						within that file */
+#define TRX_SYS_MYSQL_LOG_NAME		12	/*!< MySQL log file name */
+
+/** Memory map TRX_SYS_PAGE_NO = 5 when srv_page_size = 4096
+
+0...37 FIL_HEADER
+38...45 TRX_SYS_TRX_ID_STORE
+46...55 TRX_SYS_FSEG_HEADER (FSEG_HEADER_SIZE == 10)
+56      TRX_SYS_RSEGS
+  56...59  TRX_SYS_RSEG_SPACE       for slot 0
+  60...63  TRX_SYS_RSEG_PAGE_NO     for slot 0
+  64...67  TRX_SYS_RSEG_SPACE       for slot 1
+  68...71  TRX_SYS_RSEG_PAGE_NO     for slot 1
+....
+ 594..597  TRX_SYS_RSEG_SPACE       for slot 72
+ 598..601  TRX_SYS_RSEG_PAGE_NO     for slot 72
+...
+  ...1063  TRX_SYS_RSEG_PAGE_NO     for slot 126
+
+(srv_page_size-3500 WSREP ::: FAIL would overwrite undo tablespace
+space_id, page_no pairs :::)
+596 TRX_SYS_WSREP_XID_INFO             TRX_SYS_WSREP_XID_MAGIC_N_FLD
+600 TRX_SYS_WSREP_XID_FORMAT
+604 TRX_SYS_WSREP_XID_GTRID_LEN
+608 TRX_SYS_WSREP_XID_BQUAL_LEN
+612 TRX_SYS_WSREP_XID_DATA   (len = 128)
+739 TRX_SYS_WSREP_XID_DATA_END
+
+FIXED WSREP XID info offsets for 4k page size 10.0.32-galera
+(srv_page_size-2500)
+1596 TRX_SYS_WSREP_XID_INFO             TRX_SYS_WSREP_XID_MAGIC_N_FLD
+1600 TRX_SYS_WSREP_XID_FORMAT
+1604 TRX_SYS_WSREP_XID_GTRID_LEN
+1608 TRX_SYS_WSREP_XID_BQUAL_LEN
+1612 TRX_SYS_WSREP_XID_DATA   (len = 128)
+1739 TRX_SYS_WSREP_XID_DATA_END
+
+(srv_page_size - 2000 MYSQL MASTER LOG)
+2096   TRX_SYS_MYSQL_MASTER_LOG_INFO   TRX_SYS_MYSQL_LOG_MAGIC_N_FLD
+2100   TRX_SYS_MYSQL_LOG_OFFSET_HIGH
+2104   TRX_SYS_MYSQL_LOG_OFFSET_LOW
+2108   TRX_SYS_MYSQL_LOG_NAME
+
+(srv_page_size - 1000 MYSQL LOG)
+3096   TRX_SYS_MYSQL_LOG_INFO          TRX_SYS_MYSQL_LOG_MAGIC_N_FLD
+3100   TRX_SYS_MYSQL_LOG_OFFSET_HIGH
+3104   TRX_SYS_MYSQL_LOG_OFFSET_LOW
+3108   TRX_SYS_MYSQL_LOG_NAME
+
+(srv_page_size - 200 DOUBLEWRITE)
+3896   TRX_SYS_DOUBLEWRITE		TRX_SYS_DOUBLEWRITE_FSEG
+3906         TRX_SYS_DOUBLEWRITE_MAGIC
+3910         TRX_SYS_DOUBLEWRITE_BLOCK1
+3914         TRX_SYS_DOUBLEWRITE_BLOCK2
+3918         TRX_SYS_DOUBLEWRITE_REPEAT
+3930         TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N
+
+(srv_page_size - 8, TAILER)
+4088..4095	FIL_TAILER
+
+*/
+#ifdef WITH_WSREP
+/** The offset to WSREP XID headers (used before MariaDB 10.3.5);
+srv_page_size - 3500, but never less than 1596 */
+#define TRX_SYS_WSREP_XID_INFO std::max(srv_page_size - 3500, 1596UL)
+#define TRX_SYS_WSREP_XID_MAGIC_N_FLD 0
+#define TRX_SYS_WSREP_XID_MAGIC_N 0x77737265
+/** XID field: formatID, gtrid_len, bqual_len, xid_data */
+#define TRX_SYS_WSREP_XID_LEN        (4 + 4 + 4 + XIDDATASIZE)
+#define TRX_SYS_WSREP_XID_FORMAT     4
+#define TRX_SYS_WSREP_XID_GTRID_LEN  8
+#define TRX_SYS_WSREP_XID_BQUAL_LEN 12
+#define TRX_SYS_WSREP_XID_DATA      16
+#endif /* WITH_WSREP */
+
+/** Doublewrite buffer */
+/* @{ */
+/** The offset of the doublewrite buffer header on the trx system header page */
+#define TRX_SYS_DOUBLEWRITE		(srv_page_size - 200)
+/*-------------------------------------------------------------*/
+#define TRX_SYS_DOUBLEWRITE_FSEG	0	/*!< fseg header of the fseg
+						containing the doublewrite
+						buffer */
+#define TRX_SYS_DOUBLEWRITE_MAGIC	FSEG_HEADER_SIZE
+						/*!< 4-byte magic number which
+						shows if we already have
+						created the doublewrite
+						buffer */
+#define TRX_SYS_DOUBLEWRITE_BLOCK1	(4 + FSEG_HEADER_SIZE)
+						/*!< page number of the
+						first page in the first
+						sequence of 64
+						(= FSP_EXTENT_SIZE) consecutive
+						pages in the doublewrite
+						buffer */
+#define TRX_SYS_DOUBLEWRITE_BLOCK2	(8 + FSEG_HEADER_SIZE)
+						/*!< page number of the
+						first page in the second
+						sequence of 64 consecutive
+						pages in the doublewrite
+						buffer */
+#define TRX_SYS_DOUBLEWRITE_REPEAT	12	/*!< we repeat
+						TRX_SYS_DOUBLEWRITE_MAGIC,
+						TRX_SYS_DOUBLEWRITE_BLOCK1,
+						TRX_SYS_DOUBLEWRITE_BLOCK2
+						this many bytes after the
+						originals, so that if the trx
+						sys header is half-written to
+						disk, we still may be able to
+						recover the information */
+/** If this is not yet set to TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+we must reset the doublewrite buffer, because starting from 4.1.x the
+space id of a data page is stored into
+FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
+#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE)
+
+/*-------------------------------------------------------------*/
+/** Contents of TRX_SYS_DOUBLEWRITE_MAGIC */
+constexpr uint32_t TRX_SYS_DOUBLEWRITE_MAGIC_N= 536853855;
+/** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */
+constexpr uint32_t TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N= 1783657386;
+/* @} */
+
+trx_t* current_trx();
+
+struct rw_trx_hash_element_t
+{
+  /** Zero-fill the whole object, then initialize the mutex. */
+  rw_trx_hash_element_t()
+  {
+    memset(static_cast<void*>(this), 0, sizeof(*this));
+    mutex.init();
+  }
+
+  /** Destroy the mutex. */
+  ~rw_trx_hash_element_t() { mutex.destroy(); }
+
+  /** Transaction identifier; lf_hash_init() relies on this to be the
+  first member of the struct. */
+  trx_id_t id;
+
+  /**
+    Transaction serialization number, assigned shortly before the
+    transaction is moved to COMMITTED_IN_MEMORY state.
+    Initially set to TRX_ID_MAX.
+  */
+  Atomic_counter<trx_id_t> no;
+  trx_t *trx;
+  srw_mutex mutex;
+};
+
+
+/**
+  Wrapper around LF_HASH to store set of in memory read-write transactions.
+*/
+
+class rw_trx_hash_t
+{
+  LF_HASH hash;
+
+
+  template <typename T> /* T: type of the caller-supplied argument */
+  using walk_action= my_bool(rw_trx_hash_element_t *element, T *action);
+
+
+  /**
+    Constructor callback for lock-free allocator.
+
+    Object is just allocated and is not yet accessible via rw_trx_hash by
+    concurrent threads. Object can be reused multiple times before it is freed.
+    Every time object is being reused initializer() callback is called.
+  */
+
+  static void rw_trx_hash_constructor(uchar *arg)
+  {
+    new (static_cast<void*>(arg + LF_HASH_OVERHEAD)) rw_trx_hash_element_t;
+  }
+
+
+  /**
+    Destructor callback for lock-free allocator.
+
+    Object is about to be freed and is not accessible via rw_trx_hash by
+    concurrent threads.
+  */
+
+  static void rw_trx_hash_destructor(uchar *arg)
+  {
+    auto elem= reinterpret_cast<rw_trx_hash_element_t*>(arg + LF_HASH_OVERHEAD);
+    elem->~rw_trx_hash_element_t();
+  }
+
+
+  /**
+    Destructor callback for lock-free allocator.
+
+    This destructor is used at shutdown. It frees remaining transaction
+    objects.
+
+    XA PREPARED transactions may remain if they haven't been committed or
+    rolled back. ACTIVE transactions may remain if startup was interrupted or
+    server is running in read-only mode or for certain srv_force_recovery
+    levels.
+  */
+
+  static void rw_trx_hash_shutdown_destructor(uchar *arg)
+  {
+    rw_trx_hash_element_t *const element=
+      reinterpret_cast<rw_trx_hash_element_t*>(arg + LF_HASH_OVERHEAD);
+    trx_t *trx= element->trx;
+    if (trx) /* a transaction is still registered in this element */
+    {
+      ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED) ||
+            trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) ||
+            (trx_state_eq(trx, TRX_STATE_ACTIVE) &&
+             (!srv_was_started || srv_read_only_mode ||
+              srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO)));
+      trx_free_at_shutdown(trx);
+    }
+    element->~rw_trx_hash_element_t();
+  }
+
+
+  /**
+    Initializer callback for lock-free hash.
+
+    Object is not yet accessible via rw_trx_hash by concurrent threads, but is
+    about to become such. Object id can be changed only by this callback and
+    remains the same until all pins to this object are released.
+
+    Object trx can be changed to 0 by erase() under object mutex protection,
+    which indicates it is about to be removed from lock-free hash and become
+    not accessible by concurrent threads.
+  */
+
+  static void rw_trx_hash_initializer(LF_HASH *,
+                                      rw_trx_hash_element_t *element,
+                                      trx_t *trx)
+  {
+    ut_ad(element->trx == 0); /* the element must not be bound to a trx */
+    element->trx= trx;
+    element->id= trx->id; /* the key of the element */
+    element->no= TRX_ID_MAX; /* no serialization number assigned yet */
+    trx->rw_trx_hash_element= element; /* back-pointer from the trx */
+  }
+
+
+  /**
+    Gets LF_HASH pins.
+
+    Pins are used to protect object from being destroyed or reused. They are
+    normally stored in trx object for quick access. If caller doesn't have trx
+    available, we try to get it using current_trx(). If caller doesn't have trx
+    at all, temporary pins are allocated.
+  */
+
+  LF_PINS *get_pins(trx_t *trx)
+  {
+    if (LF_PINS *cached= trx->rw_trx_hash_pins)
+      return cached; /* pins were already allocated for this trx */
+    /* First use: allocate pins and remember them in the trx object. */
+    trx->rw_trx_hash_pins= lf_hash_get_pins(&hash);
+    ut_a(trx->rw_trx_hash_pins);
+    return trx->rw_trx_hash_pins;
+  }
+
+
+  template <typename T> struct eliminate_duplicates_arg
+  {
+    trx_ids_t ids; /* identifiers for which action was already invoked */
+    walk_action<T> *action; /* callback to forward to */
+    T *argument; /* passed through to action */
+    eliminate_duplicates_arg(size_t size, walk_action<T> *act, T *arg)
+      : ids(), action(act), argument(arg) { ids.reserve(size); }
+  };
+
+
+  template <typename T>
+  static my_bool eliminate_duplicates(rw_trx_hash_element_t *element,
+                                      eliminate_duplicates_arg<T> *arg)
+  {
+    /* Invoke the action only for identifiers that were not seen before. */
+    const trx_id_t id= element->id;
+    for (const trx_id_t seen : arg->ids)
+      if (seen == id)
+        return 0;
+    arg->ids.push_back(id);
+    return arg->action(element, arg->argument);
+  }
+
+
+#ifdef UNIV_DEBUG
+  static void validate_element(trx_t *trx) /* debug-only consistency checks */
+  {
+    ut_ad(!trx->read_only || !trx->rsegs.m_redo.rseg);
+    ut_ad(!trx->is_autocommit_non_locking());
+    /* trx->state can be anything except TRX_STATE_NOT_STARTED */
+    ut_d(trx->mutex_lock()); /* the state is examined under the trx mutex */
+    ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
+          trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) ||
+          trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) ||
+          trx_state_eq(trx, TRX_STATE_PREPARED));
+    ut_d(trx->mutex_unlock());
+  }
+
+
+  template <typename T> struct debug_iterator_arg
+  {
+    walk_action<T> *action; /* callback invoked by debug_iterator() */
+    T *argument; /* passed through to action */
+  };
+
+
+  template <typename T>
+  static my_bool debug_iterator(rw_trx_hash_element_t *element,
+                                debug_iterator_arg<T> *arg)
+  {
+    element->mutex.wr_lock(); /* element->trx is read under the mutex */
+    if (trx_t *trx= element->trx)
+      validate_element(trx);
+    element->mutex.wr_unlock();
+    ut_ad(element->id < element->no);
+    return arg->action(element, arg->argument); /* forward to the action */
+  }
+#endif
+
+
+public:
+  void init()
+  {
+    lf_hash_init(&hash, sizeof(rw_trx_hash_element_t), LF_HASH_UNIQUE, 0,
+                 sizeof(trx_id_t), 0, &my_charset_bin);
+    hash.initializer= /* the callback takes typed pointers, hence the cast */
+      reinterpret_cast<lf_hash_initializer>(rw_trx_hash_initializer);
+    hash.alloc.destructor= rw_trx_hash_destructor;
+    hash.alloc.constructor= rw_trx_hash_constructor;
+  }
+
+
+  void destroy()
+  { /* install the shutdown destructor before the hash is destroyed */
+    hash.alloc.destructor= rw_trx_hash_shutdown_destructor;
+    lf_hash_destroy(&hash);
+  }
+
+
+  /**
+    Releases LF_HASH pins.
+
+    Must be called by thread that owns trx_t object when the latter is being
+    "detached" from thread (e.g. released to the pool by trx_t::free()). Can be
+    called earlier if thread is expected not to use rw_trx_hash.
+
+    Since pins are not allowed to be transferred to another thread,
+    initialisation thread calls this for recovered transactions.
+  */
+
+  void put_pins(trx_t *trx)
+  {
+    LF_PINS *pins= trx->rw_trx_hash_pins;
+    if (!pins)
+      return; /* no pins were allocated for this transaction */
+    lf_hash_put_pins(pins);
+    trx->rw_trx_hash_pins= nullptr;
+  }
+
+
+  /**
+    Finds trx object in lock-free hash with given id.
+
+    Only ACTIVE or PREPARED trx objects may participate in hash. Nevertheless
+    the transaction may get committed before this method returns.
+
+    With do_ref_count == false the caller may dereference returned trx pointer
+    only if lock_sys.latch was acquired before calling find().
+
+    With do_ref_count == true caller may dereference trx even if it is not
+    holding lock_sys.latch. Caller is responsible for calling
+    trx->release_reference() when it is done playing with trx.
+
+    Ideally this method should get caller rw_trx_hash_pins along with trx
+    object as a parameter, similar to insert() and erase(). However most
+    callers lose trx early in their call chains and it is not that easy to pass
+    them through.
+
+    So we take more expensive approach: get trx through current_thd()->ha_data.
+    Some threads don't have trx attached to THD, and at least server
+    initialisation thread, fts_optimize_thread, srv_master_thread,
+    dict_stats_thread, srv_monitor_thread, btr_defragment_thread don't even
+    have THD at all. For such cases we allocate pins only for duration of
+    search and free them immediately.
+
+    This has negative performance impact and should be fixed eventually (by
+    passing caller_trx as a parameter). Still stream of DML is more or less Ok.
+
+    @return
+      @retval 0 not found
+      @retval pointer to trx
+  */
+
+  trx_t *find(trx_t *caller_trx, trx_id_t trx_id, bool do_ref_count)
+  {
+    /*
+      In MariaDB 10.3, purge will reset DB_TRX_ID to 0 when the history
+      is lost. Read/write transactions always have a nonzero trx_t::id;
+      the value 0 is reserved for transactions that did not write or
+      lock anything yet. The caller must therefore have handled
+      trx_id==0 specially before calling this.
+
+      A caller must not request a reference to its own transaction.
+    */
+    ut_ad(trx_id);
+    ut_ad(!caller_trx || caller_trx->id != trx_id || !do_ref_count);
+
+    trx_t *trx= 0;
+    LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash);
+    ut_a(pins);
+
+    rw_trx_hash_element_t *element= reinterpret_cast<rw_trx_hash_element_t*>
+      (lf_hash_search(&hash, pins, reinterpret_cast<const void*>(&trx_id),
+                      sizeof(trx_id_t)));
+    if (element)
+    {
+      /* rw_trx_hash_t::erase() sets element->trx to nullptr while holding
+      element->mutex, before it removes the element from the hash table.
+      If the element was removed before we acquire the mutex below, we
+      will therefore observe element->trx == nullptr. */
+      DEBUG_SYNC_C("before_trx_hash_find_element_mutex_enter");
+      element->mutex.wr_lock();
+      /* element_trx cannot point to a reused object now. If the transaction
+      was deregistered before we acquired element->mutex, element->trx is
+      nullptr. It cannot be deregistered while we hold element->mutex. */
+      trx_t *element_trx = element->trx;
+      lf_hash_search_unpin(pins);
+      /* The *element may be reused from now on; the value of element->trx
+      that we need has been saved in the local variable element_trx. */
+      DEBUG_SYNC_C("after_trx_hash_find_element_mutex_enter");
+      if ((trx= element_trx)) {
+        DBUG_ASSERT(trx_id == trx->id);
+        ut_d(validate_element(trx));
+        if (do_ref_count)
+        {
+          /*
+            We have an early state check here to avoid committer
+            starvation in a wait loop for transaction references,
+            when there is a stream of trx_sys.find() calls from other
+            threads. The trx->state may change to COMMITTED after
+            trx->mutex is released, and it will have to be rechecked
+            by the caller after reacquiring the mutex.
+          */
+          /* trx_t::commit_in_memory() sets the state to
+          TRX_STATE_COMMITTED_IN_MEMORY before deregistering the transaction.
+          It also waits for any implicit-to-explicit lock conversions to cease
+          after deregistering. A committed transaction is reported as absent. */
+          if (trx->state == TRX_STATE_COMMITTED_IN_MEMORY)
+            trx= nullptr;
+          else
+            trx->reference();
+        }
+      }
+      /* The lifetime of an element equals the lifetime of the hash, which is
+      why element->mutex is still valid here even though the element has been
+      unpinned. At worst, another thread waits for element->mutex. */
+      element->mutex.wr_unlock();
+    }
+    if (!caller_trx) /* the pins were allocated only for this search */
+      lf_hash_put_pins(pins);
+    return trx;
+  }
+
+
+  /**
+    Inserts trx to lock-free hash.
+
+    Object becomes accessible via rw_trx_hash.
+  */
+
+  void insert(trx_t *trx)
+  {
+    ut_d(validate_element(trx));
+    LF_PINS *trx_pins= get_pins(trx);
+    const int status= lf_hash_insert(&hash, trx_pins, static_cast<void*>(trx));
+    ut_a(status == 0); /* any nonzero status is treated as fatal */
+  }
+
+
+  /**
+    Removes trx from lock-free hash.
+
+    Object becomes not accessible via rw_trx_hash. But it still can be pinned
+    by concurrent find(), which is supposed to release it immediately after
+    it sees object trx is 0.
+  */
+
+  void erase(trx_t *trx)
+  {
+    ut_d(validate_element(trx));
+    auto *const element= trx->rw_trx_hash_element;
+    element->mutex.wr_lock();
+    element->trx= nullptr; /* detach first, so that find() sees no trx */
+    element->mutex.wr_unlock();
+    const int status= lf_hash_delete(&hash, get_pins(trx),
+      static_cast<const void*>(&trx->id), sizeof(trx_id_t));
+    ut_a(status == 0); /* the element must have been present */
+  }
+
+
+  /**
+    Returns the number of elements in the hash.
+
+    The number is exact only if hash is protected against concurrent
+    modifications (e.g. single threaded startup or hash is protected
+    by some mutex). Otherwise the number may be used as a hint only,
+    because it may change even before this method returns.
+  */
+
+  uint32_t size() { return static_cast<uint32_t>(lf_hash_size(&hash)); }
+
+
+  /**
+    Iterates the hash.
+
+    @param caller_trx  used to get/set pins
+    @param action      called for every element in hash
+    @param argument    opaque argument passed to action
+
+    May return the same element multiple times if hash is under contention.
+    If caller doesn't like to see the same transaction multiple times, it has
+    to call iterate_no_dups() instead.
+
+    May return element with committed transaction. If caller doesn't like to
+    see committed transactions, it has to skip those under element mutex:
+
+      element->mutex.wr_lock();
+      if (trx_t *trx= element->trx)
+      {
+        // trx is protected against commit in this branch
+      }
+      element->mutex.wr_unlock();
+
+    May miss concurrently inserted transactions.
+
+    @return
+      @retval 0 iteration completed successfully
+      @retval 1 iteration was interrupted (action returned 1)
+  */
+
+  template <typename T>
+  int iterate(trx_t *caller_trx, walk_action<T> *action, T *argument= nullptr)
+  {
+    const bool borrowed= !caller_trx; /* pins are taken just for this walk */
+    LF_PINS *pins= borrowed ? lf_hash_get_pins(&hash) : get_pins(caller_trx);
+    ut_a(pins);
+#ifdef UNIV_DEBUG
+    debug_iterator_arg<T> debug_arg= { action, argument };
+    action= reinterpret_cast<decltype(action)>(debug_iterator<T>);
+    argument= reinterpret_cast<T*>(&debug_arg);
+#endif
+    const int status= lf_hash_iterate(
+      &hash, pins, reinterpret_cast<my_hash_walk_action>(action),
+      const_cast<void*>(static_cast<const void*>(argument)));
+    if (borrowed)
+      lf_hash_put_pins(pins);
+    return status;
+  }
+
+  /** Convenience overload: iterates on behalf of current_trx(). */
+  template <typename T>
+  int iterate(walk_action<T> *action, T *argument= nullptr)
+  {
+    return iterate(current_trx(), action, argument);
+  }
+
+
+  /**
+    Iterates the hash and eliminates duplicate elements.
+
+    @sa iterate()
+  */
+
+  template <typename T>
+  int iterate_no_dups(trx_t *caller_trx, walk_action<T> *action,
+                      T *argument= nullptr)
+  {
+    eliminate_duplicates_arg<T> dedup(size() + 32, action, argument);
+    return iterate(caller_trx, eliminate_duplicates<T>, &dedup);
+  }
+
+  /** Convenience overload: iterates on behalf of current_trx(). */
+  template <typename T>
+  int iterate_no_dups(walk_action<T> *action, T *argument= nullptr)
+  {
+    return iterate_no_dups(current_trx(), action, argument);
+  }
+};
+
+class thread_safe_trx_ilist_t
+{
+public:
+  void create() { mysql_mutex_init(trx_sys_mutex_key, &mutex, nullptr); }
+  void close() { mysql_mutex_destroy(&mutex); }
+  /** @return whether the list is empty, sampled while holding the mutex */
+  bool empty() const
+  {
+    mysql_mutex_lock(&mutex);
+    const bool is_empty= trx_list.empty();
+    mysql_mutex_unlock(&mutex);
+    return is_empty;
+  }
+  /** Insert trx at the front of the list while holding the mutex. */
+  void push_front(trx_t &trx)
+  {
+    mysql_mutex_lock(&mutex);
+    trx_list.push_front(trx);
+    mysql_mutex_unlock(&mutex);
+  }
+  /** Remove trx from the list while holding the mutex. */
+  void remove(trx_t &trx)
+  {
+    mysql_mutex_lock(&mutex);
+    trx_list.remove(trx);
+    mysql_mutex_unlock(&mutex);
+  }
+  /** Invoke callback on every element (read-only) while holding the mutex. */
+  template <typename Callable> void for_each(Callable &&callback) const
+  {
+    mysql_mutex_lock(&mutex);
+    for (const auto &item : trx_list)
+      callback(item);
+    mysql_mutex_unlock(&mutex);
+  }
+  /** Invoke callback on every element (mutable) while holding the mutex. */
+  template <typename Callable> void for_each(Callable &&callback)
+  {
+    mysql_mutex_lock(&mutex);
+    for (auto &item : trx_list)
+      callback(item);
+    mysql_mutex_unlock(&mutex);
+  }
+  /** freeze() locks the list mutex and unfreeze() unlocks it. */
+  void freeze() const { mysql_mutex_lock(&mutex); }
+  void unfreeze() const { mysql_mutex_unlock(&mutex); }
+
+private:
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) mutable mysql_mutex_t mutex;
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) ilist<trx_t> trx_list;
+};
+
+/** The transaction system central memory data structure. */
+class trx_sys_t
+{
+  /**
+    The smallest number not yet assigned as a transaction id or transaction
+    number. Accessed and updated with atomic operations.
+  */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) Atomic_counter<trx_id_t> m_max_trx_id;
+
+
+  /**
+    Solves race conditions between register_rw() and snapshot_ids() as well as
+    race condition between assign_new_trx_no() and snapshot_ids().
+
+    @sa register_rw()
+    @sa assign_new_trx_no()
+    @sa snapshot_ids()
+  */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+  std::atomic<trx_id_t> m_rw_trx_hash_version;
+
+
+  bool m_initialised;
+
+  /** False if there is no undo log to purge or rollback */
+  bool undo_log_nonempty;
+public:
+  /** List of all transactions. */
+  thread_safe_trx_ilist_t trx_list;
+
+  /** Temporary rollback segments */
+  trx_rseg_t temp_rsegs[TRX_SYS_N_RSEGS];
+
+  /** Persistent rollback segments; space==nullptr if slot not in use */
+  trx_rseg_t rseg_array[TRX_SYS_N_RSEGS];
+
+  /**
+    Lock-free hash of in memory read-write transactions.
+    Works faster when it is on its own cache line (tested).
+  */
+
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) rw_trx_hash_t rw_trx_hash;
+
+
+#ifdef WITH_WSREP
+  /** Latest recovered XID during startup */
+  XID recovered_wsrep_xid;
+#endif
+  /** Latest recovered binlog offset */
+  uint64_t recovered_binlog_offset;
+  /** Latest recovered binlog file name */
+  char recovered_binlog_filename[TRX_SYS_MYSQL_LOG_NAME_LEN];
+  /** FIL_PAGE_LSN of the page with the latest recovered binlog metadata */
+  lsn_t recovered_binlog_lsn;
+
+
+  /**
+    Constructor.
+
+    Some members may require late initialisation, thus we just mark object as
+    uninitialised. Real initialisation happens in create().
+  */
+
+  trx_sys_t(): m_initialised(false) {}
+
+
+  /**
+    @return TRX_RSEG_HISTORY length (number of committed transactions to purge)
+  */
+  size_t history_size();
+
+
+  /**
+    Check whether history_size() exceeds a specified number.
+    @param threshold   number of committed transactions
+    @return whether TRX_RSEG_HISTORY length exceeds the threshold
+  */
+  bool history_exceeds(size_t threshold);
+
+
+  /**
+    @return approximate history_size(), without latch protection
+  */
+  TPOOL_SUPPRESS_TSAN size_t history_size_approx() const;
+
+
+  /**
+    @return whether history_size() is nonzero (with some race condition)
+  */
+  TPOOL_SUPPRESS_TSAN bool history_exists();
+
+
+  /**
+    Determine if the specified transaction or any older one might be active.
+
+    @param trx         current transaction
+    @param id          transaction identifier
+    @return whether any transaction not newer than id might be active
+  */
+
+  bool find_same_or_older(trx_t *trx, trx_id_t id)
+  {
+    if (trx->max_inactive_id >= id)
+      return false; /* an earlier search up to this id found nothing */
+    if (rw_trx_hash.iterate(trx, find_same_or_older_callback, &id))
+      return true;
+    trx->max_inactive_id= id; /* cache the negative answer */
+    return false;
+  }
+
+
+  /**
+    Determines the maximum transaction id.
+
+    @return maximum currently allocated trx id; will be stale after the
+            next call to trx_sys.get_new_trx_id()
+  */
+
+  trx_id_t get_max_trx_id()
+  {
+    return static_cast<trx_id_t>(m_max_trx_id); /* read of the counter */
+  }
+
+
+  /**
+    Allocates a new transaction id.
+    @return new, allocated trx id
+  */
+
+  trx_id_t get_new_trx_id()
+  {
+    const trx_id_t allocated= get_new_trx_id_no_refresh();
+    refresh_rw_trx_hash_version(); /* keep the version in step with the id */
+    return allocated;
+  }
+
+
+  /**
+    Allocates and assigns new transaction serialisation number.
+
+    There's a gap between m_max_trx_id increment and transaction serialisation
+    number becoming visible through rw_trx_hash. While we're in this gap
+    concurrent thread may come and do MVCC snapshot without seeing allocated
+    but not yet assigned serialisation number. Then at some point purge thread
+    may clone this view. As a result it won't see newly allocated serialisation
+    number and may remove "unnecessary" history data of this transaction from
+    rollback segments.
+
+    m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has
+    to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively
+    means that all transaction serialisation numbers up to m_max_trx_id are
+    available through rw_trx_hash.
+
+    We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so
+    that m_rw_trx_hash_version increment happens after
+    trx->rw_trx_hash_element->no becomes visible through rw_trx_hash.
+
+    @param trx transaction
+  */
+  void assign_new_trx_no(trx_t *trx)
+  {
+    trx->rw_trx_hash_element->no= get_new_trx_id_no_refresh();
+    refresh_rw_trx_hash_version(); /* must follow the store to no */
+  }
+
+
+  /**
+    Takes MVCC snapshot.
+
+    To reduce malloc probability we reserve rw_trx_hash.size() + 32 elements
+    in ids.
+
+    For details about get_rw_trx_hash_version() != get_max_trx_id() spin
+    @sa register_rw() and @sa assign_new_trx_no().
+
+    We rely on get_rw_trx_hash_version() to issue ACQUIRE memory barrier so
+    that loading of m_rw_trx_hash_version happens before accessing rw_trx_hash.
+
+    To optimise snapshot creation rw_trx_hash.iterate() is being used instead
+    of rw_trx_hash.iterate_no_dups(). It means that some transaction
+    identifiers may appear multiple times in ids.
+
+    @param[in,out] caller_trx used to get access to rw_trx_hash_pins
+    @param[out]    ids        array to store registered transaction identifiers
+    @param[out]    max_trx_id variable to store m_max_trx_id value
+    @param[out]    min_trx_no variable to store min(no) value
+  */
+
+  void snapshot_ids(trx_t *caller_trx, trx_ids_t *ids, trx_id_t *max_trx_id,
+                    trx_id_t *min_trx_no)
+  {
+    snapshot_ids_arg arg(ids);
+
+    while ((arg.m_id= get_rw_trx_hash_version()) != get_max_trx_id())
+      ut_delay(1); /* an id or no was allocated but is not yet published */
+    arg.m_no= arg.m_id; /* start value for the minimum computed below */
+
+    ids->clear();
+    ids->reserve(rw_trx_hash.size() + 32);
+    rw_trx_hash.iterate(caller_trx, copy_one_id, &arg);
+
+    *max_trx_id= arg.m_id;
+    *min_trx_no= arg.m_no;
+  }
+
+
+  /** Initialiser for m_max_trx_id and m_rw_trx_hash_version. */
+  void init_max_trx_id(trx_id_t value)
+  { /* both counters start out equal, which is what snapshot_ids() waits for */
+    m_max_trx_id= value;
+    m_rw_trx_hash_version.store(value, std::memory_order_relaxed);
+  }
+
+
+  bool is_initialised() const { return m_initialised; }
+
+
+  /** Initialise the transaction subsystem. */
+  void create();
+
+  /** Close the transaction subsystem on shutdown. */
+  void close();
+
+  /** @return total number of active (non-prepared) transactions */
+  size_t any_active_transactions(size_t *prepared= nullptr);
+
+
+  /**
+    Determine the rollback segment identifier.
+
+    @param rseg        rollback segment
+    @param persistent  whether the rollback segment is persistent
+    @return the rollback segment identifier
+  */
+  unsigned rseg_id(const trx_rseg_t *rseg, bool persistent) const
+  {
+    const trx_rseg_t *const first= persistent ? rseg_array : temp_rsegs;
+    ut_ad(first <= rseg);
+    ut_ad(first + TRX_SYS_N_RSEGS > rseg);
+    return static_cast<unsigned>(rseg - first); /* index within the array */
+  }
+
+
+  /**
+    Registers read-write transaction.
+
+    Transaction becomes visible to MVCC.
+
+    There's a gap between m_max_trx_id increment and transaction becoming
+    visible through rw_trx_hash. While we're in this gap concurrent thread may
+    come and do MVCC snapshot. As a result concurrent read view will be able to
+    observe records owned by this transaction even before it was committed.
+
+    m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has
+    to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively
+    means that all transactions up to m_max_trx_id are available through
+    rw_trx_hash.
+
+    We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so
+    that m_rw_trx_hash_version increment happens after transaction becomes
+    visible through rw_trx_hash.
+  */
+
+  void register_rw(trx_t *trx)
+  {
+    trx->id= get_new_trx_id_no_refresh();
+    rw_trx_hash.insert(trx); /* needs trx->id to be assigned already */
+    refresh_rw_trx_hash_version(); /* must come last (release barrier) */
+  }
+
+
+  /**
+    Deregisters read-write transaction.
+
+    Transaction is removed from rw_trx_hash, which releases all implicit locks.
+    MVCC snapshot won't see this transaction anymore.
+  */
+
+  void deregister_rw(trx_t *trx)
+  {
+    rw_trx_hash.erase(trx); /* clears element->trx, then unlinks the element */
+  }
+
+
+  bool is_registered(trx_t *caller_trx, trx_id_t id)
+  { /* id 0 is never looked up; no reference is taken on the transaction */
+    return id != 0 && find(caller_trx, id, false) != nullptr;
+  }
+
+
+  trx_t *find(trx_t *caller_trx, trx_id_t id, bool do_ref_count= true)
+  {
+    return rw_trx_hash.find(caller_trx, id, do_ref_count); /* id != 0 */
+  }
+
+
+  /**
+    Registers transaction in trx_sys.
+
+    @param trx transaction
+  */
+  void register_trx(trx_t *trx)
+  {
+    trx_list.push_front(*trx); /* the list takes its own mutex */
+  }
+
+
+  /**
+    Deregisters transaction in trx_sys.
+
+    @param trx transaction
+  */
+  void deregister_trx(trx_t *trx)
+  {
+    trx_list.remove(*trx); /* the list takes its own mutex */
+  }
+
+
+  /**
+    Clones the oldest view and stores it in view.
+
+    No need to call ReadView::close(). The caller owns the view that is passed
+    in. This function is called by purge thread to determine whether it should
+    purge the delete marked record or not.
+  */
+  void clone_oldest_view(ReadViewBase *view) const;
+
+
+  /** @return the number of active views */
+  size_t view_count() const
+  {
+    size_t open_views= 0;
+    /* for_each() holds the list mutex while the lambda runs. */
+    auto tally= [&open_views](const trx_t &trx) {
+      open_views+= trx.read_view.is_open() ? 1 : 0;
+    };
+    trx_list.for_each(tally);
+
+    return open_views;
+  }
+
+  /** Flag the undo log as nonempty if val holds; a set flag is not cleared */
+  void set_undo_non_empty(bool val)
+  {
+    if (!undo_log_nonempty) /* sticky: a set flag is never cleared here */
+      undo_log_nonempty= val;
+  }
+
+  /** @return whether the undo log has not been flagged as nonempty */
+  bool is_undo_empty() const { return !undo_log_nonempty; }
+
+  /** Reset the trx_sys page, retaining the dblwr information and the
+  system rollback segment header page
+  @return error code */
+  inline dberr_t reset_page(mtr_t *mtr);
+private:
+  static my_bool find_same_or_older_callback(rw_trx_hash_element_t *element,
+                                             trx_id_t *id)
+  {
+    return (element->id <= *id) ? 1 : 0; /* 1 interrupts the iteration */
+  }
+
+
+  struct snapshot_ids_arg
+  { /* argument passed to copy_one_id() by snapshot_ids() */
+    snapshot_ids_arg(trx_ids_t *ids): m_ids(ids) {}
+    trx_ids_t *m_ids; /*!< container that receives the collected ids */
+    trx_id_t m_id; /*!< bound: only elements with id below this are copied */
+    trx_id_t m_no; /*!< smallest element->no among the copied elements */
+  };
+
+
+  static my_bool copy_one_id(rw_trx_hash_element_t *element,
+                             snapshot_ids_arg *arg)
+  {
+    /* Skip ids at or above the snapshot bound; 0 keeps the walk going. */
+    if (!(element->id < arg->m_id))
+      return 0;
+    const trx_id_t no= element->no; /* sampled before the id is recorded */
+    arg->m_ids->push_back(element->id);
+    if (no < arg->m_no)
+      arg->m_no= no;
+    return 0;
+  }
+
+
+  /** Getter for m_rw_trx_hash_version, must issue ACQUIRE memory barrier. */
+  trx_id_t get_rw_trx_hash_version()
+  { /* pairs with the release increment in refresh_rw_trx_hash_version() */
+    return m_rw_trx_hash_version.load(std::memory_order_acquire);
+  }
+
+
+  /** Increments m_rw_trx_hash_version, must issue RELEASE memory barrier. */
+  void refresh_rw_trx_hash_version()
+  { /* pairs with the acquire load in get_rw_trx_hash_version() */
+    m_rw_trx_hash_version.fetch_add(1, std::memory_order_release);
+  }
+
+
+  /**
+    Allocates new transaction id without refreshing rw_trx_hash version.
+
+    This method is extracted for exclusive use by register_rw() and
+    assign_new_trx_no() where new id must be allocated atomically with
+    payload of these methods from MVCC snapshot point of view.
+
+    @sa get_new_trx_id()
+    @sa assign_new_trx_no()
+
+    @return new transaction id
+  */
+
+  trx_id_t get_new_trx_id_no_refresh()
+  {
+    return m_max_trx_id++; /* post-increment: yields the pre-increment value */
+  }
+};
+
+
+/** The transaction system */
+extern trx_sys_t trx_sys;
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
new file mode 100644
index 00000000..3cfbe331
--- /dev/null
+++ b/storage/innobase/include/trx0trx.h
@@ -0,0 +1,1268 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0trx.h
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0trx_h
+#define trx0trx_h
+
+#include "trx0types.h"
+#include "lock0types.h"
+#include "que0types.h"
+#include "mem0mem.h"
+#include "trx0xa.h"
+#include "ut0vec.h"
+#include "fts0fts.h"
+#include "read0types.h"
+#include "ilist.h"
+#include "row0merge.h"
+
+#include <vector>
+
+// Forward declaration
+struct mtr_t;
+struct rw_trx_hash_element_t;
+
+/******************************************************************//**
+Set detailed error message for the transaction. */
+void
+trx_set_detailed_error(
+/*===================*/
+	trx_t*		trx,	/*!< in: transaction struct */
+	const char*	msg);	/*!< in: detailed error message */
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewound before reading from it. */
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+	trx_t*	trx,	/*!< in: transaction struct */
+	FILE*	file);	/*!< in: file to read message from */
+/****************************************************************//**
+Retrieves the error_info field from a trx.
+@return the error info */
+UNIV_INLINE
+const dict_index_t*
+trx_get_error_info(
+/*===============*/
+	const trx_t*	trx);	/*!< in: trx object */
+
+/** @return an allocated transaction */
+trx_t *trx_create();
+
+/** At shutdown, frees a transaction object. */
+void trx_free_at_shutdown(trx_t *trx);
+
+/** Disconnect a prepared transaction from MySQL.
+@param[in,out]	trx	transaction */
+void trx_disconnect_prepared(trx_t *trx);
+
+/** Initialize (resurrect) transactions at startup. */
+dberr_t trx_lists_init_at_db_start();
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+void
+trx_start_if_not_started_xa_low(
+/*============================*/
+	trx_t*	trx,		/*!< in/out: transaction */
+	bool	read_write);	/*!< in: true if read write transaction */
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+void
+trx_start_if_not_started_low(
+/*=========================*/
+	trx_t*	trx,		/*!< in/out: transaction */
+	bool	read_write);	/*!< in: true if read write transaction */
+
+/**
+Start a transaction for internal processing.
+@param trx          transaction
+@param read_write   whether writes may be performed */
+void trx_start_internal_low(trx_t *trx, bool read_write);
+
+#ifdef UNIV_DEBUG /* record the call site; each argument is evaluated once */
+#define trx_start_if_not_started_xa(t, rw)			\
+	do {							\
+	trx_t *const trx_arg_ = (t);				\
+	trx_arg_->start_line = __LINE__; trx_arg_->start_file = __FILE__; \
+	trx_start_if_not_started_xa_low(trx_arg_, (rw));	\
+	} while (false)
+
+#define trx_start_if_not_started(t, rw)				\
+	do {							\
+	trx_t *const trx_arg_ = (t);				\
+	trx_arg_->start_line = __LINE__; trx_arg_->start_file = __FILE__; \
+	trx_start_if_not_started_low(trx_arg_, (rw));		\
+	} while (false)
+
+#define trx_start_internal(t)					\
+	do {							\
+	trx_t *const trx_arg_ = (t);				\
+	trx_arg_->start_line = __LINE__; trx_arg_->start_file = __FILE__; \
+	trx_start_internal_low(trx_arg_, true);			\
+	} while (false)
+#define trx_start_internal_read_only(t)				\
+	do {							\
+	trx_t *const trx_arg_ = (t);				\
+	trx_arg_->start_line = __LINE__; trx_arg_->start_file = __FILE__; \
+	trx_start_internal_low(trx_arg_, false);		\
+	} while (false)
+#else /* !UNIV_DEBUG: forward to the _low functions without bookkeeping */
+#define trx_start_if_not_started(t, rw)				\
+	trx_start_if_not_started_low((t), (rw))
+
+#define trx_start_internal(t) trx_start_internal_low((t), true)
+#define trx_start_internal_read_only(t) trx_start_internal_low((t), false)
+
+#define trx_start_if_not_started_xa(t, rw)			\
+	trx_start_if_not_started_xa_low((t), (rw))
+#endif /* UNIV_DEBUG */
+
+/** Start a transaction for a DDL operation.
+@param trx   transaction */
+void trx_start_for_ddl_low(trx_t *trx);
+
+#ifdef UNIV_DEBUG /* assert that no start site is recorded, then record it */
+# define trx_start_for_ddl(t)					\
+	do {							\
+	trx_t *const trx_arg_ = (t); /* evaluate t only once */	\
+	ut_ad(trx_arg_->start_file == 0);			\
+	trx_arg_->start_line = __LINE__; trx_arg_->start_file = __FILE__; \
+	trx_start_for_ddl_low(trx_arg_);			\
+	} while (0)
+#else
+# define trx_start_for_ddl(t) trx_start_for_ddl_low(t)
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Does the transaction commit for MySQL.
+@return DB_SUCCESS or error number */
+dberr_t
+trx_commit_for_mysql(
+/*=================*/
+	trx_t*	trx);	/*!< in/out: transaction */
+/** XA PREPARE a transaction.
+@param[in,out]	trx	transaction to prepare */
+void trx_prepare_for_mysql(trx_t* trx);
+/**********************************************************************//**
+This function is used to find number of prepared transactions and
+their transaction objects for a recovery.
+@return number of prepared transactions */
+int
+trx_recover_for_mysql(
+/*==================*/
+	XID*	xid_list,	/*!< in/out: prepared transactions */
+	uint	len);		/*!< in: number of slots in xid_list */
+/** Look up an X/Open distributed transaction in XA PREPARE state.
+@param[in]	xid	X/Open XA transaction identifier
+@return	transaction on match (the trx_t::xid will be invalidated);
+note that the trx may have been committed before the caller acquires
+trx_t::mutex
+@retval	NULL if no match */
+trx_t* trx_get_trx_by_xid(const XID* xid);
+/** Durably write log until trx->commit_lsn
+(if trx_t::commit_in_memory() was invoked with flush_log_later=true). */
+void trx_commit_complete_for_mysql(trx_t *trx);
+/**********************************************************************//**
+Marks the latest SQL statement ended. */
+void
+trx_mark_sql_stat_end(
+/*==================*/
+	trx_t*	trx);	/*!< in: trx handle */
+/****************************************************************//**
+Prepares a transaction for commit/rollback. */
+void
+trx_commit_or_rollback_prepare(
+/*===========================*/
+	trx_t*	trx);	/*!< in/out: transaction */
+/*********************************************************************//**
+Creates a commit command node struct.
+@return own: commit node struct */
+commit_node_t*
+trx_commit_node_create(
+/*===================*/
+	mem_heap_t*	heap);	/*!< in: mem heap where created */
+/***********************************************************//**
+Performs an execution step for a commit type node in a query graph.
+@return query thread to run next, or NULL */
+que_thr_t*
+trx_commit_step(
+/*============*/
+	que_thr_t*	thr);	/*!< in: query thread */
+
+/**********************************************************************//**
+Prints info about a transaction. */
+void
+trx_print_low(
+/*==========*/
+	FILE*		f,
+			/*!< in: output stream */
+	const trx_t*	trx,
+			/*!< in: transaction */
+	ulint		max_query_len,
+			/*!< in: max query length to print,
+			or 0 to use the default max length */
+	ulint		n_rec_locks,
+			/*!< in: trx->lock.n_rec_locks */
+	ulint		n_trx_locks,
+			/*!< in: length of trx->lock.trx_locks */
+	ulint		heap_size);
+			/*!< in: mem_heap_get_size(trx->lock.lock_heap) */
+
+/**********************************************************************//**
+Prints info about a transaction.
+When possible, use trx_print() instead. */
+void
+trx_print_latched(
+/*==============*/
+	FILE*		f,		/*!< in: output stream */
+	const trx_t*	trx,		/*!< in: transaction */
+	ulint		max_query_len);	/*!< in: max query length to print,
+					or 0 to use the default max length */
+
+/**********************************************************************//**
+Prints info about a transaction.
+Acquires and releases lock_sys.latch. */
+void
+trx_print(
+/*======*/
+	FILE*		f,		/*!< in: output stream */
+	const trx_t*	trx,		/*!< in: transaction */
+	ulint		max_query_len);	/*!< in: max query length to print,
+					or 0 to use the default max length */
+
+/**********************************************************************//**
+Determines if a transaction is in the given state.
+The caller must hold trx->mutex, or it must be the thread
+that is serving a running transaction.
+A running RW transaction must be in trx_sys.rw_trx_hash.
+@return true if trx->state == state */
+UNIV_INLINE
+bool
+trx_state_eq(
+/*=========*/
+	const trx_t*	trx,	/*!< in: transaction */
+	trx_state_t	state,	/*!< in: state;
+				if state != TRX_STATE_NOT_STARTED
+				asserts that
+				trx->state != TRX_STATE_NOT_STARTED */
+	bool		relaxed = false)
+				/*!< in: whether to allow
+				trx->state == TRX_STATE_NOT_STARTED
+				after an error has been reported */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**********************************************************************//**
+Determines if the currently running transaction has been interrupted.
+@return true if interrupted */
+bool
+trx_is_interrupted(
+/*===============*/
+	const trx_t*	trx);	/*!< in: transaction */
+
+/*******************************************************************//**
+Calculates the "weight" of a transaction. The weight of one transaction
+is estimated as the number of altered rows + the number of locked rows.
+@param t transaction (the macro evaluates this argument twice)
+@return transaction weight */
+#define TRX_WEIGHT(t)	((t)->undo_no + UT_LIST_GET_LEN((t)->lock.trx_locks))
+
+/** Create the trx_t pool */
+void
+trx_pool_init();
+
+/** Destroy the trx_t pool */
+void
+trx_pool_close();
+
+/**
+Set the transaction as a read-write transaction if it is not already
+tagged as such.
+@param[in,out] trx	Transaction that needs to be "upgraded" to RW from RO */
+void
+trx_set_rw_mode(
+	trx_t*		trx);
+
+/**
+Transactions that aren't started by the MySQL server don't set
+the trx_t::mysql_thd field. For such transactions we set the lock
+wait timeout to 0 instead of the user configured value that comes
+from innodb_lock_wait_timeout via trx_t::mysql_thd.
+@param trx transaction
+@return lock wait timeout in seconds */
+#define trx_lock_wait_timeout_get(t)					\
+	(!(t)->mysql_thd						\
+	 ? 0								\
+	 : thd_lock_wait_timeout((t)->mysql_thd))
+
+typedef std::vector<ib_lock_t*, ut_allocator<ib_lock_t*> >	lock_list;
+
+/** The locks and state of an active transaction. Protected by
+lock_sys.latch, trx->mutex or both. */
+struct trx_lock_t
+{
+  /** Lock request being waited for.
+  Set to nonnull when holding lock_sys.latch, lock_sys.wait_mutex and
+  trx->mutex, by the thread that is executing the transaction.
+  Set to nullptr when holding lock_sys.wait_mutex. */
+  Atomic_relaxed<lock_t*> wait_lock;
+  /** Transaction being waited for; protected by lock_sys.wait_mutex */
+  trx_t *wait_trx;
+  /** condition variable for !wait_lock; used with lock_sys.wait_mutex */
+  pthread_cond_t cond;
+  /** lock wait start time */
+  Atomic_relaxed<my_hrtime_t> suspend_time;
+
+#if  defined(UNIV_DEBUG) || !defined(DBUG_OFF)
+  /** Flags: 2=high priority WSREP thread has marked this trx to abort;
+  1=another transaction chose this as a victim in deadlock resolution.
+
+  Threads other than the one that is executing the transaction may set
+  flags in this while holding lock_sys.wait_mutex. */
+  Atomic_relaxed<byte> was_chosen_as_deadlock_victim;
+
+  /** Flag the lock owner as a victim in Galera conflict resolution. */
+  void set_wsrep_victim()
+  {
+# if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    /* Atomically set bit 1 (value 2), like fetch_or(2) below. The 80386
+    BTS instruction has no 8-bit form; the 16-bit access is technically
+    too wide, but there are other data members stored after the byte. */
+    __asm__ __volatile__("lock btsw $1, %0"
+                         : "+m" (was_chosen_as_deadlock_victim));
+# else
+    was_chosen_as_deadlock_victim.fetch_or(2);
+# endif
+  }
+#else /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
+
+  /** High priority WSREP thread has marked this trx to abort or
+  another transaction chose this as a victim in deadlock resolution.
+
+  Threads other than the one that is executing the transaction may set
+  this while holding lock_sys.wait_mutex. */
+  Atomic_relaxed<bool> was_chosen_as_deadlock_victim;
+
+  /** Flag the lock owner as a victim in Galera conflict resolution. */
+  void set_wsrep_victim() { was_chosen_as_deadlock_victim= true; }
+#endif /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
+
+  /** Next available rec_pool[] entry */
+  byte rec_cached;
+  /** Next available table_pool[] entry */
+  byte table_cached;
+
+	que_thr_t*	wait_thr;	/*!< query thread belonging to this
+					trx that is in waiting
+					state. For threads suspended in a
+					lock wait, this is protected by
+					lock_sys.latch. Otherwise, this may
+					only be modified by the thread that is
+					serving the running transaction. */
+
+  /** Pre-allocated record locks, each aligned to a cache line */
+  struct {
+    alignas(CPU_LEVEL1_DCACHE_LINESIZE) ib_lock_t lock;
+  } rec_pool[8];
+
+  /** Pre-allocated table locks */
+  ib_lock_t table_pool[8];
+
+  /** Memory heap for trx_locks. Protected by lock_sys.assert_locked()
+  and lock_sys.is_writer() || trx->mutex_is_owner(). */
+  mem_heap_t *lock_heap;
+
+  /** Locks held by the transaction. Protected by lock_sys.assert_locked()
+  and lock_sys.is_writer() || trx->mutex_is_owner().
+  (If lock_sys.latch is only held in shared mode, then the modification
+  must be protected by trx->mutex.) */
+  trx_lock_list_t trx_locks;
+
+	lock_list	table_locks;	/*!< All table locks requested by this
+					transaction, including AUTOINC locks */
+
+	/** List of pending trx_t::evict_table() */
+	UT_LIST_BASE_NODE_T(dict_table_t) evicted_tables;
+
+  /** number of record locks; protected by lock_sys.assert_locked(page_id) */
+  ulint n_rec_locks;
+};
+
+/** Logical time (in terms of trx_t::undo_no) at which a transaction
+first modified a table, plus the state of a bulk insert into it */
+class trx_mod_table_time_t
+{
+  /** Value that trx_t::undo_no can never take */
+  static constexpr undo_no_t NONE= ~undo_no_t{0};
+  /** Theoretical upper bound of trx_t::undo_no.
+  DB_ROLL_PTR is only 7 bytes wide, so no more undo log records
+  than this can be pointed to. */
+  static constexpr undo_no_t LIMIT= (undo_no_t{1} << (7 * 8)) - 1;
+
+  /** Bit of 'first' telling that subsequent operations are covered by
+  a TRX_UNDO_EMPTY record (for the first statement to insert into an
+  empty table) */
+  static constexpr undo_no_t BULK= 1ULL << 63;
+
+  /** First modification of the table, possibly ORed with BULK */
+  undo_no_t first;
+  /** First modification of a system versioned column
+  (NONE= no versioning, BULK= the table was dropped) */
+  undo_no_t first_versioned= NONE;
+#ifdef UNIV_DEBUG
+  /** Whether the modified table is a FTS auxiliary table */
+  bool fts_aux_table= false;
+#endif /* UNIV_DEBUG */
+
+  /** Buffer that stores the insert operations of a bulk insert */
+  row_merge_bulk_t *bulk_store= nullptr;
+
+  friend struct trx_t;
+public:
+  /** Constructor
+  @param rows   number of modified rows so far */
+  trx_mod_table_time_t(undo_no_t rows) : first(rows)
+  {
+    ut_ad(rows < LIMIT);
+  }
+
+#ifdef UNIV_DEBUG
+  /** Consistency check
+  @param rows   number of modified rows so far
+  @return whether the object is valid */
+  bool valid(undo_no_t rows= NONE) const
+  {
+    const auto first_no= first & LIMIT;
+    return first_no <= first_versioned && first_no <= rows;
+  }
+#endif /* UNIV_DEBUG */
+  /** @return if versioned columns were modified */
+  bool is_versioned() const
+  {
+    return (LIMIT & ~first_versioned) != 0;
+  }
+  /** @return if the table was dropped */
+  bool is_dropped() const
+  {
+    return BULK == first_versioned;
+  }
+
+  /** After writing an undo log record, set is_versioned() if needed
+  @param rows   number of modified rows so far */
+  void set_versioned(undo_no_t rows)
+  {
+    ut_ad(first_versioned == NONE);
+    first_versioned= rows;
+    ut_ad(valid(rows));
+  }
+
+  /** After writing an undo log record, note that the table will be dropped */
+  void set_dropped()
+  {
+    ut_ad(first_versioned == NONE);
+    first_versioned= BULK;
+  }
+
+  /** Notify the start of a bulk insert operation.
+  A buffer is allocated unless the table is temporary.
+  @param table table to do bulk operation */
+  void start_bulk_insert(dict_table_t *table)
+  {
+    first= first | BULK;
+    if (table->is_temporary())
+      return;
+    bulk_store= new row_merge_bulk_t(table);
+  }
+
+  /** Notify the end of a bulk insert operation */
+  void end_bulk_insert()
+  {
+    first= first & ~BULK;
+  }
+
+  /** @return whether an insert is covered by TRX_UNDO_EMPTY record */
+  bool is_bulk_insert() const
+  {
+    return (first & BULK) != 0;
+  }
+
+  /** Invoked after partial rollback
+  @param limit	number of surviving modified rows (trx_t::undo_no)
+  @return	whether this should be erased from trx_t::mod_tables */
+  bool rollback(undo_no_t limit)
+  {
+    ut_ad(valid());
+    const bool erase= (first & LIMIT) >= limit;
+    if (!erase && first_versioned < limit)
+      first_versioned= NONE;
+    return erase;
+  }
+
+#ifdef UNIV_DEBUG
+  /** Mark the modified table as a FTS auxiliary table */
+  void set_aux_table()
+  {
+    fts_aux_table= true;
+  }
+
+  /** @return whether the modified table is a FTS auxiliary table */
+  bool is_aux_table() const
+  {
+    return fts_aux_table;
+  }
+#endif /* UNIV_DEBUG */
+
+  /** @return the first undo record that modified the table */
+  undo_no_t get_first() const
+  {
+    ut_ad(valid());
+    return first & LIMIT;
+  }
+
+  /** Add the tuple to the transaction bulk buffer for the given index.
+  The buffer (bulk_store) is dereferenced without a null check.
+  @param entry  tuple to be inserted
+  @param index  bulk insert for the index
+  @param trx    transaction
+  @return the result of row_merge_bulk_t::bulk_insert_buffered() */
+  dberr_t bulk_insert_buffered(const dtuple_t &entry,
+                               const dict_index_t &index, trx_t *trx)
+  {
+    row_merge_bulk_t &buffer= *bulk_store;
+    return buffer.bulk_insert_buffered(entry, index, trx);
+  }
+
+  /** Do bulk insert operation present in the buffered operation
+  @return DB_SUCCESS or error code */
+  dberr_t write_bulk(dict_table_t *table, trx_t *trx);
+
+  /** @return whether a bulk buffer exists and a bulk insert is in progress */
+  bool bulk_buffer_exist() const
+  {
+    return bulk_store != nullptr && is_bulk_insert();
+  }
+
+  /** Free the bulk insert buffer, if any */
+  void clear_bulk_buffer()
+  {
+    row_merge_bulk_t *buffer= bulk_store;
+    bulk_store= nullptr;
+    delete buffer;
+  }
+};
+
+/** Persistent tables that a transaction modified, each mapped to the
+logical time of its first modification.
+The keys are plain pointers to the in-memory table objects, because
+we know that a table object will not be destroyed while a transaction
+that modified it is running. */
+using trx_mod_tables_t= std::map<
+	dict_table_t*, trx_mod_table_time_t,
+	std::less<dict_table_t*>,
+	ut_allocator<std::pair<dict_table_t* const, trx_mod_table_time_t> > >;
+
+/** The transaction handle
+
+Normally, there is a 1:1 relationship between a transaction handle
+(trx) and a session (client connection). One session is associated
+with exactly one user transaction. There are some exceptions to this:
+
+* For DDL operations, a subtransaction is allocated that modifies the
+data dictionary tables. Lock waits and deadlocks are prevented by
+acquiring the dict_sys.latch before starting the subtransaction
+and releasing it after committing the subtransaction.
+
+* The purge system uses a special transaction that is not associated
+with any session.
+
+* If the system crashed or it was quickly shut down while there were
+transactions in the ACTIVE or PREPARED state, these transactions would
+no longer be associated with a session when the server is restarted.
+
+A session may be served by at most one thread at a time. The serving
+thread of a session might change in some MySQL implementations.
+Therefore we do not have pthread_self() assertions in the code.
+
+Normally, only the thread that is currently associated with a running
+transaction may access (read and modify) the trx object, and it may do
+so without holding any mutex. The following are exceptions to this:
+
+* trx_rollback_recovered() may access resurrected (connectionless)
+transactions (state == TRX_STATE_ACTIVE && is_recovered)
+while the system is already processing new user transactions (!is_recovered).
+
+* trx_print_low() may access transactions not associated with the current
+thread. The caller must be holding lock_sys.latch.
+
+* When a transaction handle is in the trx_sys.trx_list, some of its fields
+must not be modified without holding trx->mutex.
+
+* The locking code (in particular, lock_deadlock_recursive() and
+lock_rec_convert_impl_to_expl()) will access transactions associated
+to other connections. The locks of transactions are protected by
+lock_sys.latch (insertions also by trx->mutex). */
+
+/** A rollback segment together with its state variables. */
+struct trx_undo_ptr_t {
+	/** rollback segment assigned to the transaction,
+	or NULL if not assigned yet */
+	trx_rseg_t*	rseg;
+	/** pointer to the undo log, or NULL if nothing logged yet */
+	trx_undo_t*	undo;
+};
+
+/** A temporary rollback segment together with its undo log. */
+struct trx_temp_undo_t {
+	trx_rseg_t*	rseg;	/*!< temporary rollback segment,
+				or NULL if not assigned yet */
+	trx_undo_t*	undo;	/*!< pointer to the undo log,
+				or NULL if nothing logged yet */
+};
+
+/** The rollback segments that a transaction uses for undo logging. */
+struct trx_rsegs_t {
+	/** undo log pointer referring to a rollback segment that resides
+	in the system or an undo tablespace; used for undo logging of
+	tables that need to be recovered on crash */
+	trx_undo_ptr_t	m_redo;
+
+	/** undo log for temporary tables; discarded immediately after
+	transaction commit or rollback */
+	trx_temp_undo_t	m_noredo;
+};
+
+struct trx_t : ilist_node<>
+{
+private:
+  /**
+    Least significant 31 bits is count of references.
+
+    We can't release the locks nor commit the transaction until this reference
+    is 0. We can change the state to TRX_STATE_COMMITTED_IN_MEMORY to signify
+    that it is no longer "active".
+
+    If the most significant bit is set this transaction should stop inheriting
+    (GAP)locks. Generally set to true during transaction prepare for RC or lower
+    isolation, if requested. Needed for replication replay where
+    we don't want to get blocked on GAP locks taken for protecting
+    concurrent unique insert or replace operation.
+  */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+  Atomic_relaxed<uint32_t> skip_lock_inheritance_and_n_ref;
+
+
+public:
+  /** Transaction identifier (0 if no locks were acquired).
+  Set by trx_sys_t::register_rw() or trx_resurrect() before
+  the transaction is added to trx_sys.rw_trx_hash.
+  Cleared in commit_in_memory() after commit_state(),
+  trx_sys_t::deregister_rw(), release_locks(). */
+  trx_id_t id;
+  /** The largest encountered transaction identifier for which no
+  transaction was observed to be active. This is a cache to speed up
+  trx_sys_t::find_same_or_older(). */
+  trx_id_t max_inactive_id;
+
+private:
+  /** mutex protecting state and some of lock
+  (some are protected by lock_sys.latch) */
+  srw_spin_mutex mutex;
+#ifdef UNIV_DEBUG
+  /** The owner of mutex (0 if none); protected by mutex */
+  std::atomic<pthread_t> mutex_owner{0};
+#endif /* UNIV_DEBUG */
+public:
+  /** Initialize the mutex */
+  void mutex_init()
+  {
+    mutex.init();
+  }
+  /** Destroy the mutex */
+  void mutex_destroy()
+  {
+    mutex.destroy();
+  }
+
+  /** Acquire the mutex.
+  The calling thread must not already hold it (debug assertion). */
+  void mutex_lock()
+  {
+    ut_ad(!mutex_is_owner());
+    mutex.wr_lock();
+    /* Debug builds only: record the new owner and assert that
+    no owner was recorded before. */
+    ut_ad(!mutex_owner.exchange(pthread_self(),
+                                std::memory_order_relaxed));
+  }
+  /** Release the mutex.
+  The calling thread must be the recorded owner (debug assertion). */
+  void mutex_unlock()
+  {
+    /* Debug builds only: clear the recorded owner before releasing,
+    and assert that it was the calling thread. */
+    ut_ad(mutex_owner.exchange(0, std::memory_order_relaxed)
+	  == pthread_self());
+    mutex.wr_unlock();
+  }
+#ifndef SUX_LOCK_GENERIC
+  /** @return whether the mutex is currently locked (by any thread) */
+  bool mutex_is_locked() const noexcept
+  {
+    return mutex.is_locked();
+  }
+#endif
+#ifdef UNIV_DEBUG
+  /** @return whether the calling thread is the recorded mutex owner */
+  bool mutex_is_owner() const
+  {
+    const pthread_t owner= mutex_owner.load(std::memory_order_relaxed);
+    return owner == pthread_self();
+  }
+#endif /* UNIV_DEBUG */
+
+  /** State of the trx from the point of view of concurrency control
+  and the valid state transitions.
+
+  Possible states:
+
+  TRX_STATE_NOT_STARTED
+  TRX_STATE_ACTIVE
+  TRX_STATE_PREPARED
+  TRX_STATE_PREPARED_RECOVERED (special case of TRX_STATE_PREPARED)
+  TRX_STATE_COMMITTED_IN_MEMORY (alias below COMMITTED)
+
+  Valid state transitions are:
+
+  Regular transactions:
+  * NOT_STARTED -> ACTIVE -> COMMITTED -> NOT_STARTED
+
+  Auto-commit non-locking read-only:
+  * NOT_STARTED -> ACTIVE -> NOT_STARTED
+
+  XA (2PC):
+  * NOT_STARTED -> ACTIVE -> PREPARED -> COMMITTED -> NOT_STARTED
+
+  Recovered XA:
+  * NOT_STARTED -> PREPARED -> COMMITTED -> (freed)
+
+  Recovered XA followed by XA ROLLBACK:
+  * NOT_STARTED -> PREPARED -> ACTIVE -> COMMITTED -> (freed)
+
+  XA (2PC) (shutdown or disconnect before ROLLBACK or COMMIT):
+  * NOT_STARTED -> PREPARED -> (freed)
+
+  Disconnected XA PREPARE transaction can become recovered:
+  * ... -> ACTIVE -> PREPARED (connected) -> PREPARED (disconnected)
+
+  Latching and various transaction lists membership rules:
+
+  XA (2PC) transactions are always treated as non-autocommit.
+
+  Transitions to ACTIVE or NOT_STARTED occur when transaction
+  is not in rw_trx_hash.
+
+  Autocommit non-locking read-only transactions move between states
+  without holding any mutex. They are not in rw_trx_hash.
+
+  All transactions, unless they are determined to be ac-nl-ro,
+  explicitly tagged as read-only or read-write, will first be put
+  on the read-only transaction list. Only when a !read-only transaction
+  in the read-only list tries to acquire an X or IX lock on a table
+  do we remove it from the read-only list and put it on the read-write
+  list. During this switch we assign it a rollback segment.
+
+  When a transaction is NOT_STARTED, it can be in trx_list. It cannot be
+  in rw_trx_hash.
+
+  ACTIVE->PREPARED->COMMITTED is only possible when trx is in rw_trx_hash.
+  The transition ACTIVE->PREPARED is protected by trx->mutex.
+
+  ACTIVE->COMMITTED is possible when the transaction is in
+  rw_trx_hash.
+
+  Transitions to COMMITTED are protected by trx_t::mutex. */
+  Atomic_relaxed<trx_state_t> state;
+
+  /** The locks of the transaction. Protected by lock_sys.latch
+  (insertions also by trx_t::mutex). */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) trx_lock_t lock;
+
+#ifdef WITH_WSREP
+  /** whether wsrep_on(mysql_thd) held at the start of transaction */
+  byte wsrep;
+  /** @return whether wsrep is nonzero */
+  bool is_wsrep() const { return UNIV_UNLIKELY(wsrep); }
+  /** @return whether the bit of value 2 is set in wsrep */
+  bool is_wsrep_UK_scan() const { return UNIV_UNLIKELY(wsrep & 2); }
+#else /* WITH_WSREP */
+  /** @return false, because this build is without WITH_WSREP */
+  bool is_wsrep() const { return false; }
+#endif /* WITH_WSREP */
+
+  /** Consistent read view of the transaction */
+  ReadView read_view;
+
+	/* These fields are not protected by any mutex. */
+
+	/** false=normal transaction, true=recovered (must be rolled back)
+	or disconnected transaction in XA PREPARE STATE.
+
+	This field is accessed by the thread that owns the transaction,
+	without holding any mutex.
+	There is only one foreign-thread access in trx_print_low()
+	and a possible race condition with trx_disconnect_prepared(). */
+	bool		is_recovered;
+	const char*	op_info;	/*!< English text describing the
+					current operation, or an empty
+					string */
+	uint		isolation_level;/*!< TRX_ISO_REPEATABLE_READ, ... */
+	bool		check_foreigns;	/*!< normally TRUE, but if the user
+					wants to suppress foreign key checks,
+					(in table imports, for example) we
+					set this FALSE */
+  /** whether an insert into an empty table is active */
+  bool bulk_insert;
+	/*------------------------------*/
+	/* MySQL has a transaction coordinator to coordinate two phase
+	commit between multiple storage engines and the binary log. When
+	an engine participates in a transaction, it's responsible for
+	registering itself using the trans_register_ha() API. */
+	bool		is_registered;	/* This flag is set to true after the
+					transaction has been registered with
+					the coordinator using the XA API, and
+					is set to false  after commit or
+					rollback. */
+	/** whether this is holding the prepare mutex */
+	bool		active_commit_ordered;
+	/*------------------------------*/
+	bool		check_unique_secondary;
+					/*!< normally TRUE, but if the user
+					wants to speed up inserts by
+					suppressing unique key checks
+					for secondary indexes when we decide
+					if we can use the insert buffer for
+					them, we set this FALSE */
+	bool		flush_log_later;/* In 2PC, we hold the
+					prepare_commit mutex across
+					both phases. In that case, we
+					defer flush of the logs to disk
+					until after we release the
+					mutex. */
+	ulint		duplicates;	/*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
+  /** whether this modifies InnoDB dictionary tables */
+  bool dict_operation;
+#ifdef UNIV_DEBUG
+  /** copy of dict_operation during commit() */
+  bool was_dict_operation;
+#endif
+	/** whether dict_sys.latch is held exclusively; protected by
+	dict_sys.latch */
+	bool dict_operation_lock_mode;
+
+	/** wall-clock time of the latest transition to TRX_STATE_ACTIVE;
+	used for diagnostic purposes only */
+	time_t		start_time;
+	/** microsecond_interval_timer() of transaction start */
+	ulonglong	start_time_micro;
+	lsn_t		commit_lsn;	/*!< lsn at the time of the commit */
+	/*------------------------------*/
+	THD*		mysql_thd;	/*!< MySQL thread handle corresponding
+					to this trx, or NULL */
+
+	const char*	mysql_log_file_name;
+					/*!< if MySQL binlog is used, this field
+					contains a pointer to the latest file
+					name; this is NULL if binlog is not
+					used */
+	ulonglong	mysql_log_offset;
+					/*!< if MySQL binlog is used, this
+					field contains the end offset of the
+					binlog entry */
+	/*------------------------------*/
+	ib_uint32_t	n_mysql_tables_in_use; /*!< number of Innobase tables
+					used in the processing of the current
+					SQL statement in MySQL */
+	ib_uint32_t	mysql_n_tables_locked;
+					/*!< how many tables the current SQL
+					statement uses, except those
+					in consistent read */
+
+  /** DB_SUCCESS or error code; usually only the thread that is running
+  the transaction is allowed to modify this field. The only exception is
+  when a thread invokes lock_sys_t::cancel() in order to abort a
+  lock_wait(). That is protected by lock_sys.wait_mutex and lock.wait_lock. */
+  dberr_t error_state;
+
+	const dict_index_t*error_info;	/*!< if the error number indicates a
+					duplicate key error, a pointer to
+					the problematic index is stored here */
+	ulint		error_key_num;	/*!< if the index creation fails to a
+					duplicate key error, a mysql key
+					number of that index is stored here */
+	que_t*		graph;		/*!< query currently run in the session,
+					or NULL if none; NOTE that the query
+					belongs to the session, and it can
+					survive over a transaction commit, if
+					it is a stored procedure with a COMMIT
+					WORK statement, for instance */
+	/*------------------------------*/
+	UT_LIST_BASE_NODE_T(trx_named_savept_t)
+			trx_savepoints;	/*!< savepoints set with SAVEPOINT ...,
+					oldest first */
+	/*------------------------------*/
+	undo_no_t	undo_no;	/*!< next undo log record number to
+					assign; since the undo log is
+					private for a transaction, this
+					is a simple ascending sequence
+					with no gaps; thus it represents
+					the number of modified/inserted
+					rows in a transaction */
+	trx_savept_t	last_sql_stat_start;
+					/*!< undo_no when the last sql statement
+					was started: in case of an error, trx
+					is rolled back down to this number */
+	trx_rsegs_t	rsegs;		/* rollback segments for undo logging */
+	undo_no_t	roll_limit;	/*!< least undo number to undo during
+					a partial rollback; 0 otherwise */
+	bool		in_rollback;	/*!< true when the transaction is
+					executing a partial or full rollback */
+	ulint		pages_undone;	/*!< number of undo log pages undone
+					since the last undo log truncation */
+	/*------------------------------*/
+	ulint		n_autoinc_rows;	/*!< no. of AUTO-INC rows required for
+					an SQL statement. This is useful for
+					multi-row INSERTs */
+	ib_vector_t*    autoinc_locks;  /* AUTOINC locks held by this
+					transaction. Note that these are
+					also in the lock list trx_locks. This
+					vector needs to be freed explicitly
+					when the trx instance is destroyed.
+					Protected by lock_sys.latch. */
+	/*------------------------------*/
+	bool		read_only;	/*!< true if transaction is flagged
+					as a READ-ONLY transaction.
+					if auto_commit && !will_lock
+					then it will be handled as a
+					AC-NL-RO-SELECT (Auto Commit Non-Locking
+					Read Only Select). A read only
+					transaction will not be assigned an
+					UNDO log. */
+	bool		auto_commit;	/*!< true if it is an autocommit */
+	bool		will_lock;	/*!< set to inform trx_start_low() that
+					the transaction may acquire locks */
+	/* True if transaction has to read the undo log and
+	log the DML changes for online DDL table */
+	bool		apply_online_log = false;
+
+	/*------------------------------*/
+	fts_trx_t*	fts_trx;	/*!< FTS information, or NULL if
+					transaction hasn't modified tables
+					with FTS indexes (yet). */
+	doc_id_t	fts_next_doc_id;/* The document id used for updates */
+	/*------------------------------*/
+	ib_uint32_t	flush_tables;	/*!< if "covering" the FLUSH TABLES",
+					count of tables being flushed. */
+
+	/*------------------------------*/
+#ifdef UNIV_DEBUG
+	unsigned	start_line;	/*!< Track where it was started from */
+	const char*	start_file;	/*!< Filename where it was started */
+#endif /* UNIV_DEBUG */
+
+	XID		xid;		/*!< X/Open XA transaction
+					identification to identify a
+					transaction branch */
+	trx_mod_tables_t mod_tables;	/*!< List of tables that were modified
+					by this transaction */
+	/*------------------------------*/
+	char*		detailed_error;	/*!< detailed error message for last
+					error, or empty. */
+	rw_trx_hash_element_t *rw_trx_hash_element;
+	LF_PINS *rw_trx_hash_pins;
+	ulint		magic_n;
+
+	/** @return whether rsegs.m_redo.undo is set, that is,
+	whether any persistent undo log has been generated */
+	bool has_logged_persistent() const
+	{
+		return rsegs.m_redo.undo != nullptr;
+	}
+
+	/** @return whether any undo log (persistent or temporary)
+	has been generated */
+	bool has_logged() const
+	{
+		if (has_logged_persistent()) {
+			return true;
+		}
+
+		return rsegs.m_noredo.undo != nullptr;
+	}
+
+	/** Get the rollback segment for modifying temporary tables,
+	assigning one on first use.
+	@return rollback segment for modifying temporary tables */
+	trx_rseg_t* get_temp_rseg()
+	{
+		trx_rseg_t* assigned = rsegs.m_noredo.rseg;
+
+		if (!assigned) {
+			return assign_temp_rseg();
+		}
+
+		ut_ad(id != 0);
+		return assigned;
+	}
+
+  /** Transition to committed state, to release implicit locks. */
+  inline void commit_state();
+
+  /** Release any explicit locks of a committing transaction. */
+  inline void release_locks();
+
+  /** Evict a table definition due to the rollback of ALTER TABLE.
+  @param table_id   table identifier
+  @param reset_only whether to only reset dict_table_t::def_trx_id */
+  void evict_table(table_id_t table_id, bool reset_only= false);
+
+  /** Initiate rollback.
+  @param savept     savepoint to which to roll back
+  @return error code or DB_SUCCESS */
+  dberr_t rollback(trx_savept_t *savept= nullptr);
+  /** Roll back an active transaction.
+  @param savept     savepoint to which to roll back */
+  inline void rollback_low(trx_savept_t *savept= nullptr);
+  /** Finish rollback.
+  @return whether the rollback was completed normally
+  @retval false if the rollback was aborted by shutdown */
+  inline bool rollback_finish();
+private:
+  /** Apply any changes to tables for which online DDL is in progress. */
+  ATTRIBUTE_COLD void apply_log();
+  /** Process tables that were modified by the committing transaction. */
+  inline void commit_tables();
+  /** Mark a transaction committed in the main memory data structures.
+  @param mtr  mini-transaction (if there are any persistent modifications) */
+  inline void commit_in_memory(const mtr_t *mtr);
+  /** Write log for committing the transaction. */
+  void commit_persist();
+  /** Clean up the transaction after commit_in_memory() */
+  void commit_cleanup();
+  /** Commit the transaction in a mini-transaction.
+  @param mtr  mini-transaction (if there are any persistent modifications) */
+  void commit_low(mtr_t *mtr= nullptr);
+  /** Commit an empty transaction.
+  @param mtr   mini-transaction */
+  void commit_empty(mtr_t *mtr);
+  /** Assign the transaction its history serialisation number and write the
+  UNDO log to the assigned rollback segment.
+  @param mtr   mini-transaction */
+  inline void write_serialisation_history(mtr_t *mtr);
+public:
+  /** Commit the transaction. */
+  void commit();
+
+  /** Try to drop a persistent table.
+  @param table       persistent table
+  @return error code */
+  dberr_t drop_table(const dict_table_t &table);
+  /** Try to drop the foreign key constraints for a persistent table.
+  @param name        name of persistent table
+  @return error code */
+  dberr_t drop_table_foreign(const table_name_t &name);
+  /** Try to drop the statistics for a persistent table.
+  @param name        name of persistent table
+  @return error code */
+  dberr_t drop_table_statistics(const table_name_t &name);
+  /** Commit the transaction, possibly after drop_table().
+  @param deleted   handles of data files that were deleted */
+  void commit(std::vector<pfs_os_file_t> &deleted);
+
+
+  /** Discard every savepoint, starting from the first element of
+  trx_savepoints */
+  void savepoints_discard()
+  {
+    trx_named_savept_t *first_savept= UT_LIST_GET_FIRST(trx_savepoints);
+    savepoints_discard(first_savept);
+  }
+
+
+  /** Discard all savepoints starting from a particular savepoint.
+  @param savept    first savepoint to discard */
+  void savepoints_discard(trx_named_savept_t *savept);
+
+
+  /** @return whether the reference count (the least significant 31 bits
+  of skip_lock_inheritance_and_n_ref) is nonzero */
+  bool is_referenced() const
+  {
+    const uint32_t n_ref= skip_lock_inheritance_and_n_ref & ~(1U << 31);
+    return n_ref != 0;
+  }
+
+
+  /** Increment the reference count that is stored in the least
+  significant 31 bits of skip_lock_inheritance_and_n_ref. */
+  void reference()
+  {
+    ut_d(auto old_n_ref =)
+    skip_lock_inheritance_and_n_ref.fetch_add(1);
+    /* Debug check on the previous value with the flag (bit 31)
+    shifted out: bit 30 of the count must have been clear. */
+    ut_ad(int32_t(old_n_ref << 1) >= 0);
+  }
+
+  /** Decrement the reference count that is stored in the least
+  significant 31 bits of skip_lock_inheritance_and_n_ref. */
+  void release_reference()
+  {
+    ut_d(auto old_n_ref =)
+    skip_lock_inheritance_and_n_ref.fetch_sub(1);
+    /* Debug check on the previous value with the flag (bit 31)
+    shifted out: the count must have been nonzero, with bit 30 clear. */
+    ut_ad(int32_t(old_n_ref << 1) > 0);
+  }
+
+  /** @return whether the most significant bit of
+  skip_lock_inheritance_and_n_ref (the skip-lock-inheritance flag) is set */
+  bool is_not_inheriting_locks() const
+  {
+    const uint32_t flag= skip_lock_inheritance_and_n_ref & (1U << 31);
+    return flag != 0;
+  }
+
+  /** Set the most significant bit of skip_lock_inheritance_and_n_ref.
+  The bit is set by addition, so it must be clear beforehand
+  (debug assertion); the reference count in the low bits is unaffected. */
+  void set_skip_lock_inheritance()
+  {
+    ut_d(auto old_n_ref=) skip_lock_inheritance_and_n_ref.fetch_add(1U << 31);
+    ut_ad(!(old_n_ref >> 31));
+  }
+
+  /** Atomically clear the most significant bit of
+  skip_lock_inheritance_and_n_ref (the flag set by
+  set_skip_lock_inheritance()), leaving the reference count in the
+  least significant 31 bits intact. */
+  void reset_skip_lock_inheritance()
+  {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    /* The instruction modifies the operand, so it has to be declared
+    as a read-write ("+m") operand, like in
+    trx_lock_t::set_wsrep_victim(). */
+    __asm__ __volatile__("lock btrl $31, %0"
+                         : "+m" (skip_lock_inheritance_and_n_ref));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    _interlockedbittestandreset(
+        reinterpret_cast<volatile long *>(&skip_lock_inheritance_and_n_ref),
+        31);
+#else
+    /* Unary ~ binds tighter than <<: ~1U << 31 would be (~1U) << 31 == 0
+    and would wipe the reference count together with the flag. */
+    skip_lock_inheritance_and_n_ref.fetch_and(~(1U << 31));
+#endif
+  }
+
+  /** @return whether the transaction has a lock on
+  mysql.innodb_table_stats or mysql.innodb_index_stats */
+  bool has_stats_table_lock() const;
+
+  /** Free the memory to trx_pools */
+  void free();
+
+
+  /** Debug assertions (ut_ad) on the idle state that the object is
+  expected to be in: not started, no identifier, no mutex owner, no
+  undo log, no references, no locks or lock wait, no modified tables,
+  no open read view, and the flags at their default values. */
+  void assert_freed() const
+  {
+    ut_ad(state == TRX_STATE_NOT_STARTED);
+    ut_ad(!id);
+    ut_ad(!mutex_is_owner());
+    ut_ad(!has_logged());
+    ut_ad(!is_referenced());
+    ut_ad(!is_wsrep());
+    ut_ad(!lock.was_chosen_as_deadlock_victim);
+    ut_ad(mod_tables.empty());
+    ut_ad(!read_view.is_open());
+    ut_ad(!lock.wait_thr);
+    ut_ad(!lock.wait_lock);
+    ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+    ut_ad(lock.table_locks.empty());
+    ut_ad(!autoinc_locks || ib_vector_is_empty(autoinc_locks));
+    ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0);
+    ut_ad(!dict_operation);
+    ut_ad(!apply_online_log);
+    ut_ad(!is_not_inheriting_locks());
+    /* check_foreigns and check_unique_secondary are expected to be
+    back at true */
+    ut_ad(check_foreigns);
+    ut_ad(check_unique_secondary);
+  }
+
+  /** End the bulk insert for one table, if the transaction modified it.
+  To be invoked on SAVEPOINT or at the end of a statement.
+  Even if a TRX_UNDO_EMPTY record was written for this table to cover an
+  insert into an empty table, subsequent operations will have to be covered
+  by row-level undo log records, so that ROLLBACK TO SAVEPOINT or a
+  rollback to the start of a statement will work.
+  @param table   table on which any preceding bulk insert ended */
+  void end_bulk_insert(const dict_table_t &table)
+  {
+    const auto entry= mod_tables.find(const_cast<dict_table_t*>(&table));
+    if (entry == mod_tables.end())
+      return;
+    entry->second.end_bulk_insert();
+  }
+
+  /** @return whether this is an autocommit transaction
+  that is not going to acquire locks */
+  bool is_autocommit_non_locking() const
+  {
+    if (!auto_commit)
+      return false;
+    return !will_lock;
+  }
+
+  /** End the bulk insert for every table in mod_tables.
+  To be invoked on SAVEPOINT or at the start of a statement.
+  Even if TRX_UNDO_EMPTY records were written for any table to cover an
+  insert into an empty table, subsequent operations will have to be covered
+  by row-level undo log records, so that ROLLBACK TO SAVEPOINT or a
+  rollback to the start of a statement will work. */
+  void end_bulk_insert()
+  {
+    for (auto it= mod_tables.begin(); it != mod_tables.end(); ++it)
+      it->second.end_bulk_insert();
+  }
+
+  /** Check whether a bulk insert into an empty table is in progress:
+  bulk_insert must be set, check_unique_secondary and check_foreigns
+  must be off, and some table in mod_tables must be in bulk insert mode.
+  @return whether a bulk insert into empty table is in progress */
+  bool is_bulk_insert() const
+  {
+    if (check_foreigns || check_unique_secondary || !bulk_insert)
+      return false;
+    for (auto it= mod_tables.begin(); it != mod_tables.end(); ++it)
+    {
+      if (it->second.is_bulk_insert())
+        return true;
+    }
+    return false;
+  }
+
+  /** Look up the bulk insert buffer state of a table.
+  @param table   table to look up in mod_tables
+  @return logical modification time of the table
+  @retval nullptr if no bulk insert is active, or the table has
+  no bulk buffer in the transaction */
+  trx_mod_table_time_t *check_bulk_buffer(dict_table_t *table)
+  {
+    if (UNIV_LIKELY(!bulk_insert))
+      return nullptr;
+    ut_ad(!check_unique_secondary);
+    ut_ad(!check_foreigns);
+    const auto found= mod_tables.find(table);
+    const bool buffered= found != mod_tables.end() &&
+      found->second.bulk_buffer_exist();
+    return buffered ? &found->second : nullptr;
+  }
+
+  /** Apply the buffered bulk insert operations of the transaction;
+  nothing is done unless bulk_insert is set.
+  @return DB_SUCCESS or error code */
+  dberr_t bulk_insert_apply()
+  {
+    if (UNIV_UNLIKELY(bulk_insert))
+      return bulk_insert_apply_low();
+    return DB_SUCCESS;
+  }
+
+private:
+  /** Apply the buffered bulk inserts. */
+  dberr_t bulk_insert_apply_low();
+
+  /** Assign a rollback segment for modifying temporary tables.
+  @return the assigned rollback segment */
+  trx_rseg_t *assign_temp_rseg();
+};
+
+/**
+Check whether a transaction has been started.
+@param[in] trx		transaction whose state is to be checked
+@return whether trx->state differs from TRX_STATE_NOT_STARTED */
+inline bool trx_is_started(const trx_t* trx)
+{
+	const trx_state_t current_state = trx->state;
+	return current_state != TRX_STATE_NOT_STARTED;
+}
+
+/* Transaction isolation levels (trx->isolation_level) */
+#define TRX_ISO_READ_UNCOMMITTED	0	/* dirty read: non-locking
+						SELECTs are performed so that
+						we do not look at a possible
+						earlier version of a record;
+						thus they are not 'consistent'
+						reads under this isolation
+						level; otherwise like level
+						2 (TRX_ISO_REPEATABLE_READ) */
+
+#define TRX_ISO_READ_COMMITTED		1	/* somewhat Oracle-like
+						isolation, except that in
+						range UPDATE and DELETE we
+						must block phantom rows
+						with next-key locks;
+						SELECT ... FOR UPDATE and ...
+						LOCK IN SHARE MODE only lock
+						the index records, NOT the
+						gaps before them, and thus
+						allow free inserting;
+						each consistent read reads its
+						own snapshot */
+
+#define TRX_ISO_REPEATABLE_READ		2	/* this is the default;
+						all consistent reads in the
+						same trx read the same
+						snapshot;
+						full next-key locking used
+						in locking reads to block
+						insertions into gaps */
+
+#define TRX_ISO_SERIALIZABLE		3	/* all plain SELECTs are
+						converted to LOCK IN SHARE
+						MODE reads */
+
+/* Treatment of duplicate values (trx->duplicates; for example, in inserts).
+Multiple flags can be combined with bitwise OR. */
+#define TRX_DUP_IGNORE	1U	/* duplicate rows are to be updated */
+#define TRX_DUP_REPLACE	2U	/* duplicate rows are to be replaced */
+
+
+/** Execution states of a commit node */
+enum commit_node_state {
+	/** about to send a commit signal to the transaction */
+	COMMIT_NODE_SEND = 1,
+	/** commit signal sent to the transaction, waiting for completion */
+	COMMIT_NODE_WAIT
+};
+
+/** Node of a query graph that represents a commit command */
+struct commit_node_t{
+	/** node type: QUE_NODE_COMMIT */
+	que_common_t		common;
+	/** node execution state */
+	enum commit_node_state	state;
+};
+
+
+#include "trx0trx.inl"
+
+#endif
diff --git a/storage/innobase/include/trx0trx.inl b/storage/innobase/include/trx0trx.inl
new file mode 100644
index 00000000..b063c920
--- /dev/null
+++ b/storage/innobase/include/trx0trx.inl
@@ -0,0 +1,86 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0trx.inl
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+/**********************************************************************//**
+Determines if a transaction is in the given state. The caller must hold
+trx->mutex, or it must be the thread that is serving a running transaction.
+A running RW transaction must be in trx_sys.rw_trx_hash.
+@return whether trx->state == state, except that UNIV_DEBUG builds return
+true for any state once trx->state is TRX_STATE_NOT_STARTED */
+UNIV_INLINE
+bool
+trx_state_eq(
+/*=========*/
+	const trx_t*	trx,	/*!< in: transaction */
+	trx_state_t	state,	/*!< in: state;
+				if state != TRX_STATE_NOT_STARTED
+				asserts that
+				trx->state != TRX_STATE_NOT_STARTED */
+	bool		relaxed)
+				/*!< in: whether to allow
+				trx->state == TRX_STATE_NOT_STARTED
+				after an error has been reported */
+{
+#ifdef UNIV_DEBUG
+	switch (trx->state) {
+	case TRX_STATE_PREPARED:
+	case TRX_STATE_PREPARED_RECOVERED:
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		ut_ad(!trx->is_autocommit_non_locking());
+		return(trx->state == state);
+
+	case TRX_STATE_ACTIVE:
+		if (trx->is_autocommit_non_locking()) {
+			ut_ad(!trx->is_recovered);
+			ut_ad(trx->read_only);
+			ut_ad(trx->mysql_thd);
+		}
+		return(state == trx->state);
+
+	case TRX_STATE_NOT_STARTED:
+		/* Only a NOT_STARTED query, or a relaxed one with an error
+		number set on trx->mysql_thd, may find the trx not started. */
+		ut_a(state == TRX_STATE_NOT_STARTED
+		     || (relaxed
+			 && thd_get_error_number(trx->mysql_thd)));
+
+		return(true);
+	}
+	ut_error; /* trx->state matched none of the trx_state_t cases above */
+#endif /* UNIV_DEBUG */
+	return(trx->state == state);
+}
+
+/****************************************************************//**
+Reads the error_info field of a transaction object.
+@param[in]	trx	transaction object
+@return the value of trx->error_info */
+UNIV_INLINE
+const dict_index_t*
+trx_get_error_info(const trx_t* trx)
+{
+	const dict_index_t* const info = trx->error_info;
+	return info;
+}
diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h
new file mode 100644
index 00000000..bfa2adc0
--- /dev/null
+++ b/storage/innobase/include/trx0types.h
@@ -0,0 +1,131 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0types.h
+Transaction system global type definitions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "univ.i"
+#include "ut0new.h"
+
+#include <vector>
+
+/** printf(3) format used for printing DB_TRX_ID and other system fields */
+#define TRX_ID_FMT	IB_ID_FMT
+
+/** maximum length that a formatted trx_t::id could take, not including
+the terminating NUL character. */
+static const ulint TRX_ID_MAX_LEN = 17;
+
+/** Space id of the transaction system page (the system tablespace) */
+static constexpr uint32_t TRX_SYS_SPACE= 0;
+
+/** Page number of the transaction system page */
+#define TRX_SYS_PAGE_NO		FSP_TRX_SYS_PAGE_NO
+
+/** Random value to check for corruption of trx_t */
+static const ulint TRX_MAGIC_N = 91118598;
+
+constexpr uint innodb_purge_threads_MAX= 32;
+constexpr uint innodb_purge_batch_size_MAX= 5000;
+
+/** Transaction states (trx_t::state) */
+enum trx_state_t {
+	/** first enumerator (value 0); presumably no transaction started */
+	TRX_STATE_NOT_STARTED,
+	TRX_STATE_ACTIVE,
+	/** XA PREPARE has been executed; only XA COMMIT or XA ROLLBACK
+	are possible */
+	TRX_STATE_PREPARED,
+	/** XA PREPARE transaction that was returned to ha_recover() */
+	TRX_STATE_PREPARED_RECOVERED,
+	TRX_STATE_COMMITTED_IN_MEMORY
+};
+
+/** Memory objects */
+/* @{ */
+/** Transaction */
+struct trx_t;
+/** The locks and state of an active transaction */
+struct trx_lock_t;
+/** Rollback segment */
+struct trx_rseg_t;
+/** Transaction undo log */
+struct trx_undo_t;
+/** Rollback command node in a query graph */
+struct roll_node_t;
+/** Commit command node in a query graph */
+struct commit_node_t;
+/** SAVEPOINT command node in a query graph */
+struct trx_named_savept_t;
+/* @} */
+
+/** Row identifier (DB_ROW_ID, DATA_ROW_ID) */
+typedef ib_id_t	row_id_t;
+/** Transaction identifier (DB_TRX_ID, DATA_TRX_ID) */
+typedef ib_id_t	trx_id_t;
+/** Rollback pointer (DB_ROLL_PTR, DATA_ROLL_PTR) */
+typedef ib_id_t	roll_ptr_t;
+/** Undo number */
+typedef ib_id_t	undo_no_t;
+
+/** Transaction savepoint, identified by an undo number */
+struct trx_savept_t{
+	undo_no_t	least_undo_no;	/*!< least undo number to undo */
+};
+
+/** File objects */
+/* @{ */
+/** Undo segment header */
+typedef byte	trx_usegf_t;
+/** Undo log header */
+typedef byte	trx_ulogf_t;
+/** Undo log page header */
+typedef byte	trx_upagef_t;
+
+/** Undo log record */
+typedef	byte	trx_undo_rec_t;
+
+/* @} */
+
+/** Info required to purge a record */
+struct trx_purge_rec_t
+{
+  /** Undo log record, or nullptr (roll_ptr!=0 if the log can be skipped) */
+  const trx_undo_rec_t *undo_rec;
+  /** File pointer to undo_rec, as a rollback pointer (roll_ptr_t) */
+  roll_ptr_t roll_ptr;
+};
+
+typedef std::vector<trx_id_t, ut_allocator<trx_id_t> >	trx_ids_t;
+
+/** Number of std::unordered_map hash buckets expected to be needed
+for table IDs in a purge batch. GNU libstdc++ would default to 1 and
+enlarge and rehash on demand. */
+static constexpr size_t TRX_PURGE_TABLE_BUCKETS= 128;
+
+/** The number of rollback segments; rollback segment id must fit in
+the 7 bits reserved for it in DB_ROLL_PTR. */
+static constexpr unsigned TRX_SYS_N_RSEGS= 128;
+/** Maximum number of undo tablespaces (not counting the system tablespace) */
+static constexpr unsigned TRX_SYS_MAX_UNDO_SPACES= TRX_SYS_N_RSEGS - 1;
diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h
new file mode 100644
index 00000000..3d22a33e
--- /dev/null
+++ b/storage/innobase/include/trx0undo.h
@@ -0,0 +1,514 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0undo.h
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0undo_h
+#define trx0undo_h
+
+#ifndef UNIV_INNOCHECKSUM
+#include "trx0sys.h"
+
+/** The LSB of the "is insert" flag in DB_ROLL_PTR */
+#define ROLL_PTR_INSERT_FLAG_POS 55
+/** The LSB of the 7-bit trx_rseg_t::id in DB_ROLL_PTR */
+#define ROLL_PTR_RSEG_ID_POS 48
+/** The LSB of the 32-bit undo log page number in DB_ROLL_PTR */
+#define ROLL_PTR_PAGE_POS 16
+/** The LSB of the 16-bit byte offset within an undo log page in DB_ROLL_PTR */
+#define ROLL_PTR_BYTE_POS 0
+
+/***********************************************************************//**
+Builds a roll pointer.
+@return roll pointer */
+UNIV_INLINE
+roll_ptr_t
+trx_undo_build_roll_ptr(
+/*====================*/
+	bool	is_insert,	/*!< in: TRUE if insert undo log */
+	ulint	rseg_id,	/*!< in: rollback segment id */
+	uint32_t page_no,	/*!< in: page number */
+	uint16_t offset);	/*!< in: offset of the undo entry within page */
+/***********************************************************************//**
+Decodes a roll pointer. */
+UNIV_INLINE
+void
+trx_undo_decode_roll_ptr(
+/*=====================*/
+	roll_ptr_t	roll_ptr,	/*!< in: roll pointer */
+	bool*		is_insert,	/*!< out: TRUE if insert undo log */
+	ulint*		rseg_id,	/*!< out: rollback segment id */
+	uint32_t*	page_no,	/*!< out: page number */
+	uint16_t*	offset);	/*!< out: offset of the undo
+					entry within page */
+/***********************************************************************//**
+Determine if DB_ROLL_PTR is of the insert type.
+@return true if insert */
+UNIV_INLINE
+bool
+trx_undo_roll_ptr_is_insert(
+/*========================*/
+	roll_ptr_t	roll_ptr);	/*!< in: roll pointer */
+/***********************************************************************//**
+Returns true if the record is of the insert type.
+@return true if the record was freshly inserted (not updated). */
+UNIV_INLINE
+bool
+trx_undo_trx_id_is_insert(
+/*======================*/
+	const byte*	trx_id)	/*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */
+	MY_ATTRIBUTE((warn_unused_result));
+/** Store a roll pointer in the 7-byte DB_ROLL_PTR format.
+@param[out]	ptr		buffer that mach_write_to_7() writes to
+@param[in]	roll_ptr	DB_ROLL_PTR value to store */
+inline void trx_write_roll_ptr(byte *ptr, roll_ptr_t roll_ptr)
+{
+  compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+  mach_write_to_7(ptr, roll_ptr);
+}
+/** Load a roll pointer stored in the 7-byte DB_ROLL_PTR format.
+@param[in]	ptr	buffer that mach_read_from_7() reads from
+@return the DB_ROLL_PTR value */
+inline roll_ptr_t trx_read_roll_ptr(const byte *ptr)
+{
+  compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+  return mach_read_from_7(ptr);
+}
+
+/** Get the next record in an undo log.
+@param[in]      undo_page       undo log page
+@param[in]      rec             undo record offset in the page
+@param[in]      page_no         undo log header page number
+@param[in]      offset          undo log header offset on page
+@return undo log record, the page latched, NULL if none */
+inline trx_undo_rec_t*
+trx_undo_page_get_next_rec(const buf_block_t *undo_page, uint16_t rec,
+                           uint32_t page_no, uint16_t offset);
+/** Get the previous record in an undo log.
+@param[in,out]  block   undo log page
+@param[in]      rec     undo record offset in the page
+@param[in]      page_no undo log header page number
+@param[in]      offset  undo log header offset on page
+@param[in]      shared  latching mode: true=RW_S_LATCH, false=RW_X_LATCH
+@param[in,out]  mtr     mini-transaction
+@return undo log record, the page latched, NULL if none */
+trx_undo_rec_t*
+trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
+                      uint16_t offset, bool shared, mtr_t *mtr);
+
+/** Get the first undo log record on a page.
+@param[in]	block	undo log page
+@param[in]	page_no	undo log header page number
+@param[in]	offset	undo log header page offset
+@return	pointer to first record
+@retval	nullptr	if none exists */
+trx_undo_rec_t*
+trx_undo_page_get_first_rec(const buf_block_t *block, uint32_t page_no,
+                            uint16_t offset);
+
+/** Initialize an undo log page.
+NOTE: This corresponds to a redo log record and must not be changed!
+@see mtr_t::undo_create()
+@param[in,out]	block	undo log page */
+void trx_undo_page_init(const buf_block_t &block);
+
+/** Allocate an undo log page.
+@param[in,out]	undo	undo log
+@param[in,out]	mtr	mini-transaction that does not hold any page latch
+@param[out]	err	error code
+@return	X-latched block if success
+@retval	nullptr	on failure */
+buf_block_t *trx_undo_add_page(trx_undo_t *undo, mtr_t *mtr, dberr_t *err)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Free the last undo log page. The caller must hold the rseg mutex.
+@param[in,out]	undo	undo log
+@param[in,out]	mtr	mini-transaction that does not hold any undo log page
+			or that has allocated the undo log page
+@return error code */
+dberr_t trx_undo_free_last_page(trx_undo_t *undo, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Try to truncate the undo logs.
+@param trx transaction
+@return error code */
+dberr_t trx_undo_try_truncate(const trx_t &trx);
+
+/** Truncate the head of an undo log.
+NOTE that only whole pages are freed; the header page is not
+freed, but emptied, if all the records there are below the limit.
+@param[in,out]	rseg		rollback segment
+@param[in]	hdr_page_no	header page number
+@param[in]	hdr_offset	header offset on the page
+@param[in]	limit		first undo number to preserve
+(everything below the limit will be truncated)
+@return error code */
+dberr_t
+trx_undo_truncate_start(
+	trx_rseg_t*	rseg,
+	uint32_t	hdr_page_no,
+	uint16_t	hdr_offset,
+	undo_no_t	limit)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Mark that an undo log header belongs to a data dictionary transaction.
+@param[in]	trx	dictionary transaction
+@param[in,out]	undo	undo log
+@param[in,out]	mtr	mini-transaction */
+void trx_undo_mark_as_dict(const trx_t* trx, trx_undo_t* undo, mtr_t* mtr);
+/** Assign an undo log for a persistent transaction.
+A new undo log is created or a cached undo log reused.
+@param[in,out]	trx	transaction
+@param[out]	err	error code
+@param[in,out]	mtr	mini-transaction
+@return	the undo log block
+@retval	NULL	on error */
+buf_block_t*
+trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr)
+	MY_ATTRIBUTE((nonnull));
+/** Assign an undo log for a transaction.
+A new undo log is created or a cached undo log reused.
+@tparam is_temp  whether this is temporary undo log
+@param[in,out]	trx	transaction
+@param[in]	rseg	rollback segment
+@param[out]	undo	the undo log
+@param[in,out]	mtr	mini-transaction
+@param[out]	err	error code
+@return	the undo log block
+@retval	nullptr	on error */
+template<bool is_temp>
+buf_block_t*
+trx_undo_assign_low(trx_t *trx, trx_rseg_t *rseg, trx_undo_t **undo,
+                    mtr_t *mtr, dberr_t *err)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Set the state of the undo log segment at a XA PREPARE or XA ROLLBACK.
+@param[in,out]	trx		transaction
+@param[in,out]	undo		undo log
+@param[in]	rollback	false=XA PREPARE, true=XA ROLLBACK
+@param[in,out]	mtr		mini-transaction */
+void trx_undo_set_state_at_prepare(trx_t *trx, trx_undo_t *undo, bool rollback,
+                                   mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
+
+/** At shutdown, frees the undo logs of a transaction. */
+void
+trx_undo_free_at_shutdown(trx_t *trx);
+
+/** Read an undo log when starting up the database.
+@param[in,out]	rseg		rollback segment
+@param[in]	id		rollback segment slot
+@param[in]	page_no		undo log segment page number
+@return	the undo log
+@retval nullptr on error */
+trx_undo_t *
+trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no);
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** the only rollback segment type since MariaDB 10.3.1 */
+constexpr uint16_t TRX_UNDO_UPDATE= 2;
+/* TRX_UNDO_STATE values of an undo log segment */
+/** contains an undo log of an active transaction */
+constexpr uint16_t TRX_UNDO_ACTIVE = 1;
+/** cached for quick reuse */
+constexpr uint16_t TRX_UNDO_CACHED = 2;
+/** can be freed in purge when all undo data in it is removed */
+constexpr uint16_t TRX_UNDO_TO_PURGE = 4;
+/** contains an undo log of a prepared transaction */
+constexpr uint16_t TRX_UNDO_PREPARED = 5;
+
+#ifndef UNIV_INNOCHECKSUM
+
+/** Transaction undo log memory object; modified by the thread associated
+with the transaction. */
+
+struct trx_undo_t {
+	/*-----------------------------*/
+	ulint		id;		/*!< undo log slot number within the
+					rollback segment */
+	ulint		state;		/*!< state of the corresponding undo log
+					segment (TRX_UNDO_ACTIVE, ...) */
+	trx_id_t	trx_id;		/*!< id of the trx assigned to the undo
+					log */
+	XID		xid;		/*!< X/Open XA transaction
+					identification */
+	bool		dict_operation;	/*!< true if a dict operation trx */
+	trx_rseg_t*	rseg;		/*!< rseg where the undo log belongs */
+	/*-----------------------------*/
+	uint32_t	hdr_page_no;	/*!< page number of the header page in
+					the undo log */
+	uint32_t	last_page_no;	/*!< page number of the last page in the
+					undo log; this may differ from
+					top_page_no during a rollback */
+	uint16_t	hdr_offset;	/*!< header offset of the undo log on
+					the page */
+	uint32_t	size;		/*!< current size in pages */
+	/*-----------------------------*/
+	uint32_t	top_page_no;	/*!< page number where the latest undo
+					log record was catenated; during
+					rollback the page from which the latest
+					undo record was chosen */
+	uint16_t	top_offset;	/*!< offset of the latest undo record,
+					i.e., the topmost element in the undo
+					log if we think of it as a stack */
+	undo_no_t	top_undo_no;	/*!< undo number of the latest record
+					(IB_ID_MAX if the undo log is empty) */
+	buf_block_t*	guess_block;	/*!< guess for the buffer block where
+					the top page might reside */
+
+	/** @return whether the undo log is empty (top_undo_no is IB_ID_MAX) */
+	bool empty() const { return top_undo_no == IB_ID_MAX; }
+
+	/*-----------------------------*/
+	UT_LIST_NODE_T(trx_undo_t) undo_list;
+					/*!< undo log objects in the rollback
+					segment are chained into lists */
+};
+
+/** Cache a pointer to an undo record in a latched buffer pool page,
+parse the undo log record and store the record type, update vector
+and compiler information (cmpl_info) */
+class UndorecApplier
+{
+  /** Undo log block page id */
+  page_id_t page_id;
+  /** Pointer to within undo log record */
+  const trx_undo_rec_t *undo_rec;
+  /** Undo log record type */
+  byte type;
+  /** compiler information */
+  byte cmpl_info;
+  /** page_offset(undo_rec) of the start of undo_rec */
+  uint16_t offset;
+  /** Transaction id of the undo log */
+  const trx_id_t trx_id;
+  /** Update vector */
+  upd_t *update;
+  /** memory heap which can be used to build previous version of
+  the index record and its offsets */
+  mem_heap_t *heap;
+  /** mini-transaction for accessing B-tree pages */
+  mtr_t mtr;
+
+public:
+  /** Start with no cached undo record, as after clear_undo_rec() */
+  UndorecApplier(page_id_t page_id, trx_id_t trx_id) :
+    page_id(page_id), undo_rec(nullptr), type(0), cmpl_info(0), offset(0),
+    trx_id(trx_id), update(nullptr), heap(mem_heap_create(100)) {}
+
+  /** Assign the next page id */
+  void assign_next(const page_id_t next_page_id)
+  {
+    page_id= next_page_id;
+  }
+
+  /** @return the undo log block page id currently assigned */
+  page_id_t get_page_id() const { return page_id; }
+  /** Handle the DML undo log and apply it on online indexes */
+  inline void apply_undo_rec(const trx_undo_rec_t *rec);
+
+  /** Release the memory heap created by the constructor */
+  ~UndorecApplier()
+  {
+    mem_heap_free(heap);
+  }
+
+private:
+  /** Handle the insert undo log and apply it on online indexes
+  @param  tuple		row reference from undo log record
+  @param  clust_index	clustered index */
+  void log_insert(const dtuple_t &tuple, dict_index_t *clust_index);
+
+  /** Handle the update, delete undo log and apply it on online
+  indexes.
+  @param  tuple		row reference from undo log record
+  @param  clust_index	clustered index */
+  void log_update(const dtuple_t &tuple, dict_index_t *clust_index);
+
+  /** Check whether the given roll pointer is generated by
+  the current undo log record information stored.
+  @return true if roll pointer matches with current undo log info */
+  inline bool is_same(roll_ptr_t roll_ptr) const;
+
+  /** Clear the undo log record information and empty the heap;
+  page_id, offset and trx_id are left as they are */
+  void clear_undo_rec()
+  {
+    undo_rec= nullptr;
+    cmpl_info= 0;
+    type= 0;
+    update= nullptr;
+    mem_heap_empty(heap);
+  }
+  /** Get the correct version of the clustered index record that
+  was modified by the current undo log record. This is needed because
+  there could be multiple successive updates of the same record
+  within the same transaction.
+  @param	tuple		tuple contains primary key value
+  @param	index		clustered index
+  @param[out]	clust_rec	current clustered index record
+  @param	offsets		offsets points to the record
+  @return clustered index record which was changed by
+  the undo log record or nullptr when there is no clustered
+  index record changed by undo log record */
+  const rec_t* get_old_rec(const dtuple_t &tuple, dict_index_t *index,
+                           const rec_t **clust_rec, rec_offs **offsets);
+};
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** The offset of the undo log page header on pages of the undo log */
+#define	TRX_UNDO_PAGE_HDR	FSEG_PAGE_DATA
+/*-------------------------------------------------------------*/
+/** Transaction undo log page header offsets */
+/* @{ */
+#define	TRX_UNDO_PAGE_TYPE	0	/*!< unused; 0 (before MariaDB 10.3.1:
+					1=TRX_UNDO_INSERT or
+					2=TRX_UNDO_UPDATE) */
+#define	TRX_UNDO_PAGE_START	2	/*!< Byte offset where the undo log
+					records for the LATEST transaction
+					start on this page (remember that
+					in an update undo log, the first page
+					can contain several undo logs) */
+#define	TRX_UNDO_PAGE_FREE	4	/*!< On each page of the undo log this
+					field contains the byte offset of the
+					first free byte on the page */
+#define TRX_UNDO_PAGE_NODE	6	/*!< The file list node in the chain
+					of undo log pages */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_PAGE_HDR_SIZE	(6 + FLST_NODE_SIZE)
+					/*!< Size of the transaction undo
+					log page header, in bytes */
+/* @} */
+
+/** An update undo segment with just one page can be reused if it has
+at most this many bytes used; we must leave space at least for one new undo
+log header on the page. The value is 3 * 2^(srv_page_size_shift - 2);
+presumably 3/4 of the page size, if srv_page_size_shift is its log2. */
+#define TRX_UNDO_PAGE_REUSE_LIMIT	(3 << (srv_page_size_shift - 2))
+
+/* An update undo log segment may contain several undo logs on its first page
+if the undo logs took so little space that the segment could be cached and
+reused. All the undo log headers are then on the first page, and the last one
+owns the undo log records on subsequent pages if the segment is bigger than
+one page. If an undo log is stored in a segment, then on the first page it is
+allowed to have zero undo records, but if the segment extends to several
+pages, then all the rest of the pages must contain at least one undo log
+record. */
+
+/** The offset of the undo log segment header on the first page of the undo
+log segment */
+
+#define	TRX_UNDO_SEG_HDR	(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE)
+/** Undo log segment header */
+/* @{ */
+/*-------------------------------------------------------------*/
+#define	TRX_UNDO_STATE		0	/*!< TRX_UNDO_ACTIVE, ... */
+
+#ifndef UNIV_INNOCHECKSUM
+
+#define	TRX_UNDO_LAST_LOG	2	/*!< Offset of the last undo log header
+					on the segment header page, 0 if
+					none */
+#define	TRX_UNDO_FSEG_HEADER	4	/*!< Header for the file segment which
+					the undo log segment occupies */
+#define	TRX_UNDO_PAGE_LIST	(4 + FSEG_HEADER_SIZE)
+					/*!< Base node for the list of pages in
+					the undo log segment; defined only on
+					the undo log segment's first page */
+/*-------------------------------------------------------------*/
+/** Size of the undo log segment header */
+#define TRX_UNDO_SEG_HDR_SIZE	(4 + FSEG_HEADER_SIZE + FLST_BASE_NODE_SIZE)
+/* @} */
+
+/** The undo log header. There can be several undo log headers on the first
+page of an update undo log segment. */
+/* @{ */
+/*-------------------------------------------------------------*/
+/** Transaction start identifier, or 0 if the undo log segment has been
+completely purged and trx_purge_free_segment() has started freeing it */
+#define	TRX_UNDO_TRX_ID		0
+/** Transaction end identifier (if the log is in a history list),
+or 0 if the transaction has not been committed */
+#define	TRX_UNDO_TRX_NO		8
+/** Before MariaDB 10.3.1, when purge did not reset DB_TRX_ID of
+surviving user records, this used to be called TRX_UNDO_DEL_MARKS.
+
+This field is redundant; it is only being read by some debug assertions.
+
+The value 1 indicates that purge needs to process the undo log segment.
+The value 0 indicates that all of it has been processed, and
+trx_purge_free_segment() has been invoked, so the log is not safe to access.
+
+Before MariaDB 10.3.1, a log segment may carry the value 0 even before
+trx_purge_free_segment() was called, for those undo log records for
+which purge would not result in removing delete-marked records. */
+#define	TRX_UNDO_NEEDS_PURGE	16
+#define	TRX_UNDO_LOG_START	18	/*!< Offset of the first undo log record
+					of this log on the header page; purge
+					may remove undo log record from the
+					log start, and therefore this is not
+					necessarily the same as this log
+					header end offset */
+#define	TRX_UNDO_XID_EXISTS	20	/*!< TRUE if undo log header includes
+					X/Open XA transaction identification
+					XID */
+#define	TRX_UNDO_DICT_TRANS	21	/*!< TRUE if the transaction is a table
+					create, index create, or drop
+					transaction: in recovery
+					the transaction cannot be rolled back
+					in the usual way: a 'rollback' rather
+					means dropping the created or dropped
+					table, if it still exists */
+#define TRX_UNDO_TABLE_ID	22	/*!< Id of the table if the preceding
+					field is TRUE */
+#define	TRX_UNDO_NEXT_LOG	30	/*!< Offset of the next undo log header
+					on this page, 0 if none */
+#define	TRX_UNDO_PREV_LOG	32	/*!< Offset of the previous undo log
+					header on this page, 0 if none */
+#define TRX_UNDO_HISTORY_NODE	34	/*!< If the log is put to the history
+					list, the file list node is here */
+/*-------------------------------------------------------------*/
+/** Size of the undo log header without XID information */
+#define TRX_UNDO_LOG_OLD_HDR_SIZE (34 + FLST_NODE_SIZE)
+
+/** X/Open XA Transaction Identification (XID) */
+/* @{ */
+/** xid_t::formatID */
+#define	TRX_UNDO_XA_FORMAT	(TRX_UNDO_LOG_OLD_HDR_SIZE)
+/** xid_t::gtrid_length */
+#define	TRX_UNDO_XA_TRID_LEN	(TRX_UNDO_XA_FORMAT + 4)
+/** xid_t::bqual_length */
+#define	TRX_UNDO_XA_BQUAL_LEN	(TRX_UNDO_XA_TRID_LEN + 4)
+/** Distributed transaction identifier data */
+#define	TRX_UNDO_XA_XID		(TRX_UNDO_XA_BQUAL_LEN + 4)
+/*--------------------------------------------------------------*/
+#define TRX_UNDO_LOG_XA_HDR_SIZE (TRX_UNDO_XA_XID + XIDDATASIZE)
+					/*!< Total size of the undo log header
+					with the XA XID */
+/* @} */
+
+#include "trx0undo.inl"
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif
diff --git a/storage/innobase/include/trx0undo.inl b/storage/innobase/include/trx0undo.inl
new file mode 100644
index 00000000..9f05989f
--- /dev/null
+++ b/storage/innobase/include/trx0undo.inl
@@ -0,0 +1,129 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0undo.inl
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "data0type.h"
+#include "page0page.h"
+
+/***********************************************************************//**
+Packs the insert flag, rollback segment id, undo page number and byte
+offset into one roll pointer, at the ROLL_PTR_*_POS bit positions.
+@return roll pointer */
+UNIV_INLINE
+roll_ptr_t
+trx_undo_build_roll_ptr(
+	bool	is_insert,	/*!< in: true if insert undo log */
+	ulint	rseg_id,	/*!< in: rollback segment id */
+	uint32_t page_no,	/*!< in: page number */
+	uint16_t offset)	/*!< in: offset of the undo entry within page */
+{
+  compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+  ut_ad(rseg_id < TRX_SYS_N_RSEGS);
+  const roll_ptr_t insert_bit= roll_ptr_t{is_insert} << ROLL_PTR_INSERT_FLAG_POS;
+  const roll_ptr_t rseg_bits= roll_ptr_t{rseg_id} << ROLL_PTR_RSEG_ID_POS;
+  const roll_ptr_t page_bits= roll_ptr_t{page_no} << ROLL_PTR_PAGE_POS;
+  return insert_bit | rseg_bits | page_bits | offset;
+}
+
+/***********************************************************************//**
+Decodes a roll pointer into its fields; the inverse of
+trx_undo_build_roll_ptr(), using the same ROLL_PTR_*_POS bit positions. */
+UNIV_INLINE
+void
+trx_undo_decode_roll_ptr(
+	roll_ptr_t	roll_ptr,	/*!< in: roll pointer */
+	bool*		is_insert,	/*!< out: true if insert undo log */
+	ulint*		rseg_id,	/*!< out: rollback segment id */
+	uint32_t*	page_no,	/*!< out: page number */
+	uint16_t*	offset)		/*!< out: offset of the undo
+					entry within page */
+{
+  compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+  ut_ad(roll_ptr < (1ULL << (ROLL_PTR_INSERT_FLAG_POS + 1)));
+  *offset= static_cast<uint16_t>(roll_ptr >> ROLL_PTR_BYTE_POS);
+  *page_no= static_cast<uint32_t>(roll_ptr >> ROLL_PTR_PAGE_POS);
+  *rseg_id= static_cast<ulint>((roll_ptr >> ROLL_PTR_RSEG_ID_POS) & 0x7F);
+  *is_insert= static_cast<bool>(roll_ptr >> ROLL_PTR_INSERT_FLAG_POS);
+}
+
+/***********************************************************************//**
+Tests the insert flag (bit ROLL_PTR_INSERT_FLAG_POS) of a DB_ROLL_PTR value.
+@param[in]	roll_ptr	roll pointer
+@return true if insert */
+UNIV_INLINE
+bool
+trx_undo_roll_ptr_is_insert(roll_ptr_t roll_ptr)
+{
+	compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+	/* no bit above the insert flag may be set */
+	ut_ad(!(roll_ptr >> (ROLL_PTR_INSERT_FLAG_POS + 1)));
+	return (roll_ptr >> ROLL_PTR_INSERT_FLAG_POS) != 0;
+}
+
+/***********************************************************************//**
+Tests the most significant bit of the byte at trx_id[DATA_TRX_ID_LEN],
+the first byte of the DB_ROLL_PTR that follows DB_TRX_ID.
+@return true if the record was freshly inserted (not updated). */
+UNIV_INLINE
+bool
+trx_undo_trx_id_is_insert(
+	const byte*	trx_id)	/*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */
+{
+	compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
+	return (trx_id[DATA_TRX_ID_LEN] >> 7) != 0;
+}
+
+/** Determine the end offset of undo log records of an undo log page.
+@param[in]	undo_page	undo log page
+@param[in]	page_no		undo log header page number
+@param[in]	offset		undo log header offset
+@return nonzero TRX_UNDO_NEXT_LOG on the header page, else TRX_UNDO_PAGE_FREE */
+inline uint16_t trx_undo_page_get_end(const buf_block_t *undo_page,
+                                      uint32_t page_no, uint16_t offset)
+{
+  const byte *const frame= undo_page->page.frame;
+  if (undo_page->page.id().page_no() == page_no)
+  {
+    const uint16_t next= mach_read_from_2(frame + offset + TRX_UNDO_NEXT_LOG);
+    if (next)
+      return next;
+  }
+  return mach_read_from_2(frame + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE);
+}
+
+/** Follow the 2-byte next-record offset stored at the start of a record.
+@param[in]      undo_page       undo log page
+@param[in]      rec             undo record offset in the page
+@param[in]      page_no         undo log header page number
+@param[in]      offset          undo log header offset on page
+@return pointer to the next record within undo_page, or nullptr if none */
+inline trx_undo_rec_t*
+trx_undo_page_get_next_rec(const buf_block_t *undo_page, uint16_t rec,
+                           uint32_t page_no, uint16_t offset)
+{
+  const uint16_t limit= trx_undo_page_get_end(undo_page, page_no, offset);
+  const uint16_t next_offs= mach_read_from_2(undo_page->page.frame + rec);
+  return next_offs != limit ? undo_page->page.frame + next_offs : nullptr;
+}
diff --git a/storage/innobase/include/trx0xa.h b/storage/innobase/include/trx0xa.h
new file mode 100644
index 00000000..cb5d67cf
--- /dev/null
+++ b/storage/innobase/include/trx0xa.h
@@ -0,0 +1,61 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*
+ * Start of xa.h header
+ *
+ * Define a symbol to prevent multiple inclusions of this header file
+ */
+#ifndef	XA_H
+#define	XA_H
+
+#include "handler.h"
+
+/*
+ * Transaction branch identification: XID and NULLXID:
+ */
+#ifndef XIDDATASIZE
+
+/** Sizes of transaction identifier (only if not defined already) */
+#define	XIDDATASIZE	128		/*!< maximum size of a transaction
+					identifier, in bytes */
+#define	MAXGTRIDSIZE	 64		/*!< maximum size in bytes of gtrid */
+#define	MAXBQUALSIZE	 64		/*!< maximum size in bytes of bqual */
+
+#endif /* ifndef XIDDATASIZE */
+/** X/Open XA distributed transaction status codes */
+/* @{ */
+#define	XA_OK		0		/*!< normal execution */
+#define	XAER_ASYNC	-2		/*!< asynchronous operation already
+					outstanding */
+#define	XAER_RMERR	-3		/*!< a resource manager error
+					occurred in the transaction
+					branch */
+#define	XAER_NOTA	-4		/*!< the XID is not valid */
+#define	XAER_INVAL	-5		/*!< invalid arguments were given */
+#define	XAER_PROTO	-6		/*!< routine invoked in an improper
+					context */
+#define	XAER_RMFAIL	-7		/*!< resource manager unavailable */
+#define	XAER_DUPID	-8		/*!< the XID already exists */
+#define	XAER_OUTSIDE	-9		/*!< resource manager doing
+					work outside transaction */
+/* @} */
+#endif /* ifndef XA_H */
+/*
+ * End of xa.h header
+ */
diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i
new file mode 100644
index 00000000..1b4f70b6
--- /dev/null
+++ b/storage/innobase/include/univ.i
@@ -0,0 +1,503 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/***********************************************************************//**
+@file include/univ.i
+Version control for database, common definitions, and include files
+
+Created 1/20/1994 Heikki Tuuri
+****************************************************************************/
+
+#pragma once
+
+/** How far ahead should we tell the service manager the timeout
+(time in seconds) */
+#define INNODB_EXTEND_TIMEOUT_INTERVAL 30
+
+#if defined(_WIN32)
+# include <windows.h>
+#endif /* _WIN32 */
+
+/* Include a minimum number of SQL header files so that few changes
+made in SQL code cause a complete InnoDB rebuild.  These headers are
+used throughout InnoDB but do not include too much themselves.  They
+support cross-platform development and expose commonly used SQL names. */
+
+#include <my_global.h>
+#include "my_counter.h"
+#include "aligned.h"
+#include <m_string.h>
+#include <mysqld_error.h>
+
+/* Include <sys/stat.h> to get S_I... macros defined for os0file.cc */
+#include <sys/stat.h>
+
+#ifndef _WIN32
+# include <sched.h>
+# include "my_config.h"
+#endif
+
+#include <stdint.h>
+#include <inttypes.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#include "my_pthread.h"
+
+/* Following defines are to enable performance schema
+instrumentation in each of five InnoDB modules if
+HAVE_PSI_INTERFACE is defined. */
+#ifdef HAVE_PSI_INTERFACE
+# define UNIV_PFS_MUTEX
+# define UNIV_PFS_RWLOCK
+# define UNIV_PFS_IO
+# define UNIV_PFS_THREAD
+
+# include "mysql/psi/psi.h" /* HAVE_PSI_MEMORY_INTERFACE */
+# ifdef HAVE_PSI_MEMORY_INTERFACE
+#  define UNIV_PFS_MEMORY
+# endif /* HAVE_PSI_MEMORY_INTERFACE */
+
+#ifdef HAVE_PFS_THREAD_PROVIDER_H
+/* For PSI_MUTEX_CALL() and similar. */
+#include "pfs_thread_provider.h"
+#endif
+
+#include "mysql/psi/mysql_thread.h"
+/* For PSI_FILE_CALL(). */
+#ifdef HAVE_PFS_FILE_PROVIDER_H
+#include "pfs_file_provider.h"
+#endif
+
+#include "mysql/psi/mysql_file.h"
+
+#endif /* HAVE_PSI_INTERFACE */
+
+#ifdef _WIN32
+# define YY_NO_UNISTD_H 1
+/* VC++ tries to optimise for size by default, from V8+. The size of
+the pointer to member depends on whether the type is defined before the
+compiler sees the type in the translation unit. This default behaviour
+can cause the pointer to be a different size in different translation
+units, depending on the above rule. We force optimise for size behaviour
+for all cases. This is used by ut0lst.h related code. */
+# pragma pointers_to_members(full_generality, multiple_inheritance)
+#endif /* _WIN32 */
+
+/*			DEBUG VERSION CONTROL
+			===================== */
+
+/* When this macro is defined then additional test functions will be
+compiled. These functions live at the end of each relevant source file
+and have "test_" prefix. These functions can be called from the end of
+innodb_init() or they can be called from gdb after srv_start() has executed
+using the call command. */
+/*
+#define UNIV_COMPILE_TEST_FUNCS
+#define UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
+#define UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH
+#define UNIV_ENABLE_UNIT_TEST_DICT_STATS
+#define UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT
+*/
+
+#ifdef DBUG_OFF
+# undef UNIV_DEBUG
+#elif !defined UNIV_DEBUG
+# define UNIV_DEBUG
+#endif
+
+#if 0
+#define UNIV_DEBUG_PRINT			/* Enable the compilation of
+						some debug print functions */
+#define UNIV_AHI_DEBUG				/* Enable adaptive hash index
+						debugging without UNIV_DEBUG */
+#define UNIV_BLOB_LIGHT_DEBUG			/* Enable off-page column
+						debugging without UNIV_DEBUG */
+#define UNIV_DEBUG_LOCK_VALIDATE		/* Enable
+						ut_ad(lock_rec_validate_page())
+						assertions. */
+#define UNIV_LRU_DEBUG				/* debug the buffer pool LRU */
+#define UNIV_HASH_DEBUG				/* debug HASH_ macros */
+#define UNIV_IBUF_DEBUG				/* debug the insert buffer */
+#define UNIV_PERF_DEBUG                         /* debug flag that enables
+                                                light weight performance
+                                                related stuff. */
+#define UNIV_SEARCH_PERF_STAT			/* statistics for the
+						adaptive hash index */
+#define UNIV_BTR_PRINT				/* enable functions for
+						printing B-trees */
+#define UNIV_ZIP_DEBUG				/* extensive consistency checks
+						for compressed pages */
+#define UNIV_ZIP_COPY				/* call page_zip_copy_recs()
+						more often */
+#define UNIV_AIO_DEBUG				/* prints info about
+						submitted and reaped AIO
+						requests to the log. */
+#define UNIV_STATS_DEBUG			/* prints various stats
+						related debug info from
+						dict0stats.c */
+#define FTS_INTERNAL_DIAG_PRINT                 /* FTS internal debugging
+                                                info output */
+#endif
+
+// #define UNIV_SQL_DEBUG
+
+#ifndef MY_ATTRIBUTE
+#if defined(__GNUC__)
+#  define MY_ATTRIBUTE(A) __attribute__(A)
+#else
+#  define MY_ATTRIBUTE(A)
+#endif
+#endif
+
+#define UNIV_INLINE static inline
+
+#define UNIV_WORD_SIZE		SIZEOF_SIZE_T
+
+/** The following alignment is used in memory allocations in memory heap
+management to ensure correct alignment for doubles etc. */
+#define UNIV_MEM_ALIGNMENT	8U
+
+/*
+			DATABASE VERSION CONTROL
+			========================
+*/
+
+#if defined (HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE) || defined(_WIN32)
+#define IF_PUNCH_HOLE(A,B) A
+#else
+#define IF_PUNCH_HOLE(A,B) B
+#endif
+
+/** log2 of smallest compressed page size (1<<10 == 1024 bytes)
+Note: This must never change! */
+#define UNIV_ZIP_SIZE_SHIFT_MIN		10U
+
+/** log2 of largest compressed page size (1<<14 == 16384 bytes).
+A compressed page directory entry reserves 14 bits for the start offset
+and 2 bits for flags. This limits the uncompressed page size to 16k.
+*/
+#define UNIV_ZIP_SIZE_SHIFT_MAX		14U
+
+/* Define the Min, Max, Default page sizes. */
+/** Minimum Page Size Shift (power of 2) */
+#define UNIV_PAGE_SIZE_SHIFT_MIN	12U
+/** log2 of largest page size (1<<16 == 65536 bytes). */
+/** Maximum Page Size Shift (power of 2) */
+#define UNIV_PAGE_SIZE_SHIFT_MAX	16U
+/** log2 of default page size (1<<14 == 16384 bytes). */
+/** Default Page Size Shift (power of 2) */
+#define UNIV_PAGE_SIZE_SHIFT_DEF	14U
+/** Original 16k InnoDB Page Size Shift, in case the default changes */
+#define UNIV_PAGE_SIZE_SHIFT_ORIG	14U
+/** Original 16k InnoDB Page Size as an ssize (log2 - 9) */
+#define UNIV_PAGE_SSIZE_ORIG		(UNIV_PAGE_SIZE_SHIFT_ORIG - 9U)
+
+/** Minimum page size InnoDB currently supports. */
+#define UNIV_PAGE_SIZE_MIN	(1U << UNIV_PAGE_SIZE_SHIFT_MIN)
+/** Maximum page size InnoDB currently supports. */
+#define UNIV_PAGE_SIZE_MAX	(1U << UNIV_PAGE_SIZE_SHIFT_MAX)
+/** Default page size for InnoDB tablespaces. */
+#define UNIV_PAGE_SIZE_DEF	(1U << UNIV_PAGE_SIZE_SHIFT_DEF)
+/** Original 16k page size for InnoDB tablespaces. */
+#define UNIV_PAGE_SIZE_ORIG	(1U << UNIV_PAGE_SIZE_SHIFT_ORIG)
+
+/** Smallest compressed page size */
+#define UNIV_ZIP_SIZE_MIN	(1U << UNIV_ZIP_SIZE_SHIFT_MIN)
+
+/** Largest compressed page size */
+#define UNIV_ZIP_SIZE_MAX	(1U << UNIV_ZIP_SIZE_SHIFT_MAX)
+
+/** Largest possible ssize for an uncompressed page.
+(The convention 'ssize' is used for 'log2 minus 9' or the number of
+shifts starting with 512.)
+This max number varies depending on srv_page_size. */
+#define UNIV_PAGE_SSIZE_MAX	\
+	ulint(srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1U)
+
+/** Smallest possible ssize for an uncompressed page. */
+#define UNIV_PAGE_SSIZE_MIN	\
+	ulint(UNIV_PAGE_SIZE_SHIFT_MIN - UNIV_ZIP_SIZE_SHIFT_MIN + 1U)
+
+/** Maximum number of parallel threads in a parallelized operation */
+#define UNIV_MAX_PARALLELISM	32
+
+/** This is the "mbmaxlen" for my_charset_filename (defined in
+strings/ctype-utf8.c), which is used to encode File and Database names. */
+#define FILENAME_CHARSET_MAXNAMLEN	5
+
+/** The maximum length of an encoded table name in bytes.  The max
+table and database names are NAME_CHAR_LEN (64) characters. After the
+encoding, the max length would be NAME_CHAR_LEN (64) *
+FILENAME_CHARSET_MAXNAMLEN (5) = 320 bytes. The number does not include a
+terminating '\0'. InnoDB can handle longer names internally */
+#define MAX_TABLE_NAME_LEN	320
+
+/** The maximum length of a database name. Like MAX_TABLE_NAME_LEN this is
+the MySQL's NAME_LEN, see check_and_convert_db_name(). */
+#define MAX_DATABASE_NAME_LEN	MAX_TABLE_NAME_LEN
+
+/** MAX_FULL_NAME_LEN defines the full name path including the
+database name and table name. In addition, 14 bytes is added for:
+	2 for surrounding quotes around table name
+	1 for the separating dot (.)
+	9 for the #mysql50# prefix */
+#define MAX_FULL_NAME_LEN				\
+	(MAX_TABLE_NAME_LEN + MAX_DATABASE_NAME_LEN + 14)
+
+/** Maximum length of the compression algorithm string. Currently we support
+only (NONE | ZLIB | LZ4). */
+#define MAX_COMPRESSION_LEN     4
+
+/** The maximum length in bytes that a database name can occupy when stored in
+UTF8, including the terminating '\0', see dict_fs2utf8(). You must include
+mysql_com.h if you are to use this macro. */
+#define MAX_DB_UTF8_LEN		(NAME_LEN + 1)
+
+/** The maximum length in bytes that a table name can occupy when stored in
+UTF8, including the terminating '\0', see dict_fs2utf8(). You must include
+mysql_com.h if you are to use this macro. */
+#define MAX_TABLE_UTF8_LEN	(NAME_LEN + sizeof(srv_mysql50_table_name_prefix))
+
+/*
+			UNIVERSAL TYPE DEFINITIONS
+			==========================
+*/
+
+/** Unsigned octet of bits */
+typedef unsigned char byte;
+/** Machine-word-width unsigned integer */
+typedef size_t ulint;
+/** Machine-word-width signed integer */
+typedef ssize_t lint;
+
+/** ulint format for the printf() family of functions */
+#define ULINTPF "%zu"
+/** ulint hexadecimal format for the printf() family of functions */
+#define ULINTPFx "%zx"
+
+#ifdef _WIN32
+/* Use the integer types and formatting strings defined in Visual Studio. */
+# define UINT32PF	"%u"
+# define UINT64scan     "llu"
+# define UINT64PFx	"%016llx"
+#elif defined __APPLE__
+/* Apple prefers to call the 64-bit types 'long long'
+in both 32-bit and 64-bit environments. */
+# define UINT32PF	"%" PRIu32
+# define UINT64scan     "llu"
+# define UINT64PFx	"%016llx"
+#elif defined _AIX
+/* Workaround for macro expansion trouble */
+# define UINT32PF      "%u"
+# define UINT64scan    "lu"
+# define UINT64PFx     "%016lx"
+#else
+/* Use the integer types and formatting strings defined in the C99 standard. */
+# define UINT32PF	"%" PRIu32
+# define INT64PF	"%" PRId64
+# define UINT64scan	PRIu64
+# define UINT64PFx	"%016" PRIx64
+#endif
+
+typedef int64_t ib_int64_t;
+typedef uint64_t ib_uint64_t;
+typedef uint32_t ib_uint32_t;
+
+#define UINT64PF	"%" UINT64scan
+#define IB_ID_FMT	UINT64PF
+
+/** Log sequence number (also used for redo log byte arithmetics) */
+typedef	ib_uint64_t		lsn_t;
+
+/** The 'undefined' value for a ulint */
+#define ULINT_UNDEFINED		((ulint)(-1))
+
+/** The 'undefined' value for a ib_uint64_t */
+#define UINT64_UNDEFINED	((ib_uint64_t)(-1))
+
+/** The bitmask of 32-bit unsigned integer */
+#define ULINT32_MASK		0xFFFFFFFFU
+/** The undefined 32-bit unsigned integer */
+#define	ULINT32_UNDEFINED	ULINT32_MASK
+
+/** Maximum value for a ulint */
+#define ULINT_MAX		((ulint)(-2))
+
+/** Maximum value for ib_uint64_t */
+#define IB_UINT64_MAX		((ib_uint64_t) (~0ULL))
+
+/** The generic InnoDB system object identifier data type */
+typedef ib_uint64_t	        ib_id_t;
+#define IB_ID_MAX               (~(ib_id_t) 0)
+#define IB_ID_FMT               UINT64PF
+
+#ifndef UINTMAX_MAX
+#define UINTMAX_MAX		IB_UINT64_MAX
+#endif
+/** This 'ibool' type is used within Innobase. Remember that different included
+headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */
+#define ibool			ulint
+
+#ifndef TRUE
+
+#define TRUE    1
+#define FALSE   0
+
+#endif
+
+#define UNIV_NOTHROW
+
+/** The following number as the length of a logical field means that the field
+has the SQL NULL as its value. NOTE that because we assume that the length
+of a field is a 32-bit integer when we store it, for example, to an undo log
+on disk, we must have also this number fit in 32 bits, also in 64-bit
+computers! */
+
+#define UNIV_SQL_NULL ULINT32_UNDEFINED
+
+/** Lengths which are not UNIV_SQL_NULL, but bigger than the following
+number indicate that a field contains a reference to an externally
+stored part of the field in the tablespace. The length field then
+contains the sum of the following flag and the locally stored len. */
+
+#define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE_DEF)
+
+#if defined(__GNUC__)
+/* Tell the compiler that variable/function is unused. */
+# define UNIV_UNUSED    MY_ATTRIBUTE ((unused))
+#else
+# define UNIV_UNUSED
+#endif /* __GNUC__ */
+
+/* Some macros to improve branch prediction and reduce cache misses */
+#ifdef __GNUC__
+/* Tell the compiler that 'expr' probably evaluates to 'constant'. */
+# define UNIV_EXPECT(expr,constant) __builtin_expect(expr, constant)
+/* Tell the compiler that a pointer is likely to be NULL */
+# define UNIV_LIKELY_NULL(ptr) __builtin_expect((ptr) != 0, 0)
+/* Minimize cache-miss latency by moving data at addr into a cache before
+it is read. */
+# define UNIV_PREFETCH_R(addr) __builtin_prefetch(addr, 0, 3)
+/* Minimize cache-miss latency by moving data at addr into a cache before
+it is read or written. */
+# define UNIV_PREFETCH_RW(addr) __builtin_prefetch(addr, 1, 3)
+
+/* Sun Studio includes sun_prefetch.h as of version 5.9 */
+#elif (defined(__SUNPRO_C) || defined(__SUNPRO_CC))
+
+# include <sun_prefetch.h>
+
+# define UNIV_EXPECT(expr,value) (expr)
+# define UNIV_LIKELY_NULL(expr) (expr)
+
+//# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many((void*) addr)
+# define UNIV_PREFETCH_R(addr) ((void) 0)
+# define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr)
+
+#elif defined _MSC_VER
+# define UNIV_EXPECT(expr,value) (expr)
+# define UNIV_LIKELY_NULL(expr) (expr)
+# if defined _M_IX86 || defined _M_X64
+   // _MM_HINT_T0 - (temporal data)
+   // prefetch data into all levels of the cache hierarchy.
+#  define UNIV_PREFETCH_R(addr) _mm_prefetch((char *) (addr), _MM_HINT_T0)
+#  define UNIV_PREFETCH_RW(addr) _mm_prefetch((char *) (addr), _MM_HINT_T0)
+# elif defined _M_ARM64
+#  define UNIV_PREFETCH_R(addr) __prefetch(addr)
+#  define UNIV_PREFETCH_RW(addr) __prefetch(addr)
+# else /* other _MSC_VER targets: function-like no-ops, as in the dummies */
+#  define UNIV_PREFETCH_R(addr) ((void) 0)
+#  define UNIV_PREFETCH_RW(addr) ((void) 0)
+# endif
+#else
+/* Dummy versions of the macros */
+# define UNIV_EXPECT(expr,value) (expr)
+# define UNIV_LIKELY_NULL(expr) (expr)
+# define UNIV_PREFETCH_R(addr) ((void) 0)
+# define UNIV_PREFETCH_RW(addr) ((void) 0)
+#endif
+
+/* Tell the compiler that cond is likely to hold */
+#define UNIV_LIKELY(cond) UNIV_EXPECT(cond, TRUE)
+/* Tell the compiler that cond is unlikely to hold */
+#define UNIV_UNLIKELY(cond) UNIV_EXPECT(cond, FALSE)
+
+/* Compile-time constant of the given array's size. */
+#define UT_ARR_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+
+#include <stdio.h>
+#include "db0err.h"
+#include "ut0dbg.h"
+#include "ut0lst.h"
+#include "ut0ut.h"
+
+extern uint32_t srv_page_size_shift;
+extern ulong	srv_page_size;
+
+/* Dimension of spatial object we support so far. It has its root in
+myisam/sp_defs.h. We only support 2 dimension data */
+#define SPDIMS          2
+
+#ifdef HAVE_PSI_INTERFACE
+typedef unsigned int mysql_pfs_key_t;
+
+# ifdef UNIV_PFS_MUTEX
+extern mysql_pfs_key_t buf_pool_mutex_key;
+extern mysql_pfs_key_t dict_foreign_err_mutex_key;
+extern mysql_pfs_key_t fil_system_mutex_key;
+extern mysql_pfs_key_t flush_list_mutex_key;
+extern mysql_pfs_key_t fts_cache_mutex_key;
+extern mysql_pfs_key_t fts_cache_init_mutex_key;
+extern mysql_pfs_key_t fts_delete_mutex_key;
+extern mysql_pfs_key_t fts_doc_id_mutex_key;
+extern mysql_pfs_key_t ibuf_bitmap_mutex_key;
+extern mysql_pfs_key_t ibuf_mutex_key;
+extern mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key;
+extern mysql_pfs_key_t recalc_pool_mutex_key;
+extern mysql_pfs_key_t purge_sys_pq_mutex_key;
+extern mysql_pfs_key_t recv_sys_mutex_key;
+extern mysql_pfs_key_t rtr_active_mutex_key;
+extern mysql_pfs_key_t rtr_match_mutex_key;
+extern mysql_pfs_key_t rtr_path_mutex_key;
+extern mysql_pfs_key_t page_zip_stat_per_index_mutex_key;
+extern mysql_pfs_key_t srv_innodb_monitor_mutex_key;
+extern mysql_pfs_key_t srv_misc_tmpfile_mutex_key;
+extern mysql_pfs_key_t srv_monitor_file_mutex_key;
+extern mysql_pfs_key_t buf_dblwr_mutex_key;
+extern mysql_pfs_key_t trx_pool_mutex_key;
+extern mysql_pfs_key_t trx_pool_manager_mutex_key;
+extern mysql_pfs_key_t lock_wait_mutex_key;
+extern mysql_pfs_key_t srv_threads_mutex_key;
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+extern mysql_pfs_key_t dict_operation_lock_key;
+extern mysql_pfs_key_t fil_space_latch_key;
+extern mysql_pfs_key_t trx_i_s_cache_lock_key;
+extern mysql_pfs_key_t trx_purge_latch_key;
+extern mysql_pfs_key_t index_tree_rw_lock_key;
+extern mysql_pfs_key_t index_online_log_key;
+extern mysql_pfs_key_t trx_sys_rw_lock_key;
+extern mysql_pfs_key_t lock_latch_key;
+extern mysql_pfs_key_t log_latch_key;
+extern mysql_pfs_key_t trx_rseg_latch_key;
+# endif /* UNIV_PFS_RWLOCK */
+#endif /* HAVE_PSI_INTERFACE */
diff --git a/storage/innobase/include/ut0byte.h b/storage/innobase/include/ut0byte.h
new file mode 100644
index 00000000..2b70fac3
--- /dev/null
+++ b/storage/innobase/include/ut0byte.h
@@ -0,0 +1,107 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0byte.h
+Utilities for byte operations
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0byte_h
+#define ut0byte_h
+
+#include "univ.i"
+
+/*******************************************************//**
+Creates a 64-bit integer out of two 32-bit integers: (high << 32) | low.
+@return created integer */
+UNIV_INLINE
+ib_uint64_t
+ut_ull_create(
+/*==========*/
+	ulint	high,	/*!< in: high-order 32 bits; must fit in 32 bits */
+	ulint	low)	/*!< in: low-order 32 bits; must fit in 32 bits */
+	MY_ATTRIBUTE((const));
+
+/********************************************************//**
+Rounds a 64-bit integer downward to a multiple of a power of 2.
+@return n with the bits below align_no cleared */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_down(
+/*=================*/
+	ib_uint64_t	 n,		/*!< in: number to be rounded */
+	ulint		 align_no);	/*!< in: align by this number
+					which must be a power of 2 */
+/********************************************************//**
+Rounds ib_uint64_t upward to a multiple of a power of 2.
+@return rounded value; the internal addition wraps around modulo 2^64 */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_up(
+/*===============*/
+	ib_uint64_t	 n,		/*!< in: number to be rounded */
+	ulint		 align_no);	/*!< in: align by this number
+					which must be a power of 2 */
+/** Round a pointer down to an aligned address.
+@param ptr        pointer; asserted to be non-null
+@param alignment  a power of 2
+@return ptr with the address bits below alignment cleared */
+static inline void *ut_align_down(void *ptr, size_t alignment)
+{
+  static_assert(sizeof ptr == sizeof(size_t), "compatibility");
+  ut_ad(alignment > 0);
+  ut_ad(ut_is_2pow(alignment));
+  ut_ad(ptr);
+  const size_t low_bits= alignment - 1;
+  return reinterpret_cast<void*>(reinterpret_cast<size_t>(ptr) & ~low_bits);
+}
+
+/** Overload of ut_align_down() for pointers to const data. */
+static inline const void *ut_align_down(const void *ptr, size_t alignment)
+{
+  return ut_align_down(const_cast<void*>(ptr), alignment);
+}
+
+/** Extract the address bits that lie below an alignment boundary.
+@param ptr        pointer; asserted to be non-null
+@param alignment  a power of 2
+@return number of bytes by which ptr exceeds the preceding aligned address */
+inline size_t ut_align_offset(const void *ptr, size_t alignment)
+{
+  ut_ad(alignment > 0);
+  ut_ad(ut_is_2pow(alignment));
+  ut_ad(ptr);
+  return (alignment - 1) & reinterpret_cast<size_t>(ptr);
+}
+
+/*****************************************************************//**
+Gets the nth bit of a ulint.
+@return 1 if nth bit is 1, else 0; 0th bit is the least significant */
+UNIV_INLINE
+ibool
+ut_bit_get_nth(
+/*===========*/
+	ulint	a,	/*!< in: ulint */
+	ulint	n);	/*!< in: nth bit requested; less than the bits in ulint */
+
+#include "ut0byte.inl"
+
+#endif
diff --git a/storage/innobase/include/ut0byte.inl b/storage/innobase/include/ut0byte.inl
new file mode 100644
index 00000000..dfa069c2
--- /dev/null
+++ b/storage/innobase/include/ut0byte.inl
@@ -0,0 +1,90 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0byte.inl
+Utilities for byte operations
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+/*******************************************************//**
+Combines two 32-bit halves into one 64-bit integer.
+@return (high << 32) | low */
+UNIV_INLINE
+ib_uint64_t
+ut_ull_create(
+/*==========*/
+	ulint	high,	/*!< in: high-order 32 bits */
+	ulint	low)	/*!< in: low-order 32 bits */
+{
+	ut_ad(high <= ULINT32_MASK);
+	ut_ad(low <= ULINT32_MASK);
+	return (static_cast<ib_uint64_t>(high) << 32) | low;
+}
+
+/********************************************************//**
+Clears the low-order bits of a 64-bit integer below a power of 2.
+@return n rounded downward to a multiple of align_no */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_down(
+/*=================*/
+	ib_uint64_t	 n,		/*!< in: number to be rounded */
+	ulint		 align_no)	/*!< in: align by this number
+					which must be a power of 2 */
+{
+	ut_ad(align_no > 0);
+	ut_ad(ut_is_2pow(align_no));
+	const ib_uint64_t	low_bits = static_cast<ib_uint64_t>(align_no) - 1;
+	return n & ~low_bits;
+}
+
+/********************************************************//**
+Rounds ib_uint64_t upward to a multiple of a power of 2.
+@return n rounded upward to a multiple of align_no */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_up(
+/*===============*/
+	ib_uint64_t	 n,		/*!< in: number to be rounded */
+	ulint		 align_no)	/*!< in: align by this number
+					which must be a power of 2 */
+{
+	ut_ad(align_no > 0);
+	ut_ad(ut_is_2pow(align_no));
+
+	/* Adding the low-bit mask before clearing those bits rounds up. */
+	const ib_uint64_t	low_bits = static_cast<ib_uint64_t>(align_no) - 1;
+	return (n + low_bits) & ~low_bits;
+}
+
+/*****************************************************************//**
+Extracts one bit of a ulint; bit 0 is the least significant one.
+@return 1 if the nth bit of a is set, else 0 */
+UNIV_INLINE
+ibool
+ut_bit_get_nth(
+/*===========*/
+	ulint	a,	/*!< in: ulint */
+	ulint	n)	/*!< in: nth bit requested */
+{
+	ut_ad(n < 8 * sizeof a);
+	return (a >> n) & 1;
+}
diff --git a/storage/innobase/include/ut0counter.h b/storage/innobase/include/ut0counter.h
new file mode 100644
index 00000000..d6589cc4
--- /dev/null
+++ b/storage/innobase/include/ut0counter.h
@@ -0,0 +1,123 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ut0counter.h
+
+Counter utility class
+
+Created 2012/04/12 by Sunny Bains
+*******************************************************/
+
+#ifndef ut0counter_h
+#define ut0counter_h
+
+#include "univ.i"
+#include "my_rdtsc.h"
+
+/** Obtain a cheap pseudo-random value.
+The primary source is my_timer_cycles(). If that yields 0 (after conversion
+to size_t), fall back to the thread identifier, or on Windows to
+QueryPerformanceCounter().
+@return the cycle count if nonzero, else the fallback value */
+static inline size_t
+get_rnd_value()
+{
+	if (const size_t cycles = static_cast<size_t>(my_timer_cycles())) {
+		return cycles;
+	}
+
+	/* my_timer_cycles() gave us 0:
+	derive the value from another source. */
+#ifdef _WIN32
+	LARGE_INTEGER ticks;
+	QueryPerformanceCounter(&ticks);
+
+	return static_cast<size_t>(ticks.QuadPart);
+#else
+	return (size_t)pthread_self();
+#endif /* _WIN32 */
+}
+
+/** An Atomic_relaxed value that is aligned to CPU_LEVEL1_DCACHE_LINESIZE.
+Note: We rely on the default constructor of std::atomic and do not
+explicitly initialize the contents. This works for us, because
+ib_counter_t is only intended for usage with global memory that is
+allocated from the .bss and thus guaranteed to be zero-initialized by
+the run-time environment.
+@see srv_stats */
+template <typename Type>
+struct ib_atomic_counter_element_t {
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) Atomic_relaxed<Type> value;
+};
+
+/** A plain (non-Atomic_relaxed) value aligned to CPU_LEVEL1_DCACHE_LINESIZE */
+template <typename Type>
+struct ib_counter_element_t {
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) Type value;
+};
+
+/** A fuzzy counter that is spread over N cache-line sized slots.
+The counter is multi-instance relaxed atomic, so a total is not
+guaranteed to be 100% accurate, but it is close enough. */
+template <typename Type,
+          template <typename T> class Element = ib_atomic_counter_element_t,
+          int N = 128 >
+struct ib_counter_t {
+	/** Add 1 to a slot that is chosen by get_rnd_value(). */
+	void inc() { add(1); }
+	ib_counter_t& operator++() { add(1); return *this; }
+
+	/** Add 1 to the slot that is chosen by an identifier.
+	@param[in]	index	a reasonably thread-unique identifier */
+	void inc(size_t index) { add(index, 1); }
+
+	/** Add an amount to a slot that is chosen by get_rnd_value().
+	@param[in]	n	amount to be added */
+	void add(Type n) { add(get_rnd_value(), n); }
+
+	/** Add an amount to the slot that is chosen by an identifier.
+	@param[in]	index	a reasonably thread-unique identifier
+	@param[in]	n	amount to be added */
+	TPOOL_SUPPRESS_TSAN void add(size_t index, Type n) {
+		/* Any identifier is folded onto one of the N slots. */
+		const size_t	slot = index % N;
+
+		ut_ad(slot < UT_ARR_SIZE(m_counter));
+		m_counter[slot].value += n;
+	}
+
+	/** @return the sum of all slots - not 100% accurate,
+	since it is relaxed atomic */
+	operator Type() const {
+		Type	sum = 0;
+
+		for (const auto &slot : m_counter) {
+			sum += slot.value;
+		}
+		return sum;
+	}
+
+private:
+	static_assert(sizeof(Element<Type>) == CPU_LEVEL1_DCACHE_LINESIZE, "");
+	/** The slots, each of which occupies one cache line */
+	alignas(CPU_LEVEL1_DCACHE_LINESIZE) Element<Type> m_counter[N];
+};
+
+#endif /* ut0counter_h */
diff --git a/storage/innobase/include/ut0dbg.h b/storage/innobase/include/ut0dbg.h
new file mode 100644
index 00000000..85856660
--- /dev/null
+++ b/storage/innobase/include/ut0dbg.h
@@ -0,0 +1,179 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*****************************************************************//**
+@file include/ut0dbg.h
+Debug utilities for Innobase
+
+Created 1/30/1994 Heikki Tuuri
+**********************************************************************/
+
+#ifndef ut0dbg_h
+#define ut0dbg_h
+
+#ifdef UNIV_INNOCHECKSUM
+#define ut_a		assert
+#define ut_ad		assert
+#define ut_error	assert(0)
+#else /* !UNIV_INNOCHECKSUM */
+
+/* Do not include univ.i because univ.i includes this. */
+
+/*************************************************************//**
+Report a failed assertion; expr may be NULL (ut_error passes 0). */
+ATTRIBUTE_NORETURN ATTRIBUTE_COLD __attribute__((nonnull(2)))
+void
+ut_dbg_assertion_failed(
+/*====================*/
+	const char*	expr,	/*!< in: the failed assertion, or NULL */
+	const char*	file,	/*!< in: source file containing the assertion */
+	unsigned	line);	/*!< in: line number of the assertion */
+
+/** Abort execution if EXPR does not evaluate to nonzero.
+@param EXPR assertion expression that should hold */
+#define ut_a(EXPR) do {						\
+	if (UNIV_UNLIKELY(!(ulint) (EXPR))) {			\
+		ut_dbg_assertion_failed(#EXPR,			\
+				__FILE__, __LINE__);		\
+	}							\
+} while (0)
+
+/** Abort execution. */
+#define ut_error						\
+	ut_dbg_assertion_failed(0, __FILE__, __LINE__)
+
+/** Debug assertion */
+#define ut_ad	DBUG_SLOW_ASSERT
+#if defined(UNIV_DEBUG) || !defined(DBUG_OFF)
+/** Debug statement: EXPR is kept if UNIV_DEBUG is set or DBUG_OFF is not. */
+#define ut_d(EXPR)	EXPR
+#else
+/** Debug statement: EXPR is dropped in this non-debug configuration. */
+#define ut_d(EXPR)
+#endif
+
+#if defined(HAVE_SYS_TIME_H) && defined(HAVE_SYS_RESOURCE_H)
+
+#define HAVE_UT_CHRONO_T
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+/** A "chronometer" used to clock snippets of code. Example usage:
+	ut_chrono_t	ch("this loop");
+	for (;;) { ... }
+	ch.show();
+would print the timings of the for() loop to stderr, prefixed with
+"this loop:". Unless end() is called, the destructor calls show() too. */
+class ut_chrono_t {
+public:
+	/** Constructor.
+	@param[in]	name	chrono's name, printed by show(); not copied */
+	ut_chrono_t(
+		const char*	name)
+		:
+		m_name(name),
+		m_show_from_destructor(true)
+	{
+		reset();
+	}
+
+	/** Resets the chrono (records the current time and resource usage). */
+	void
+	reset()
+	{
+		gettimeofday(&m_tv, NULL);
+
+		getrusage(RUSAGE_SELF, &m_ru);
+	}
+
+	/** Prints the real, user and sys time since the last reset to stderr. */
+	void
+	show()
+	{
+		struct rusage	ru_now;
+		struct timeval	tv_now;
+		struct timeval	tv_diff;
+
+		getrusage(RUSAGE_SELF, &ru_now);
+
+		gettimeofday(&tv_now, NULL);
+
+#ifndef timersub
+#define timersub(a, b, r)						\
+		do {							\
+			(r)->tv_sec = (a)->tv_sec - (b)->tv_sec;	\
+			(r)->tv_usec = (a)->tv_usec - (b)->tv_usec;	\
+			if ((r)->tv_usec < 0) {				\
+				(r)->tv_sec--;				\
+				(r)->tv_usec += 1000000;		\
+			}						\
+		} while (0)
+#endif /* timersub */
+
+#define CHRONO_PRINT(type, tvp)						\
+		fprintf(stderr, "%s: %s% 5ld.%06ld sec\n",		\
+			m_name, type,					\
+			static_cast<long>((tvp)->tv_sec),		\
+			static_cast<long>((tvp)->tv_usec))
+
+		timersub(&tv_now, &m_tv, &tv_diff);
+		CHRONO_PRINT("real", &tv_diff);
+
+		timersub(&ru_now.ru_utime, &m_ru.ru_utime, &tv_diff);
+		CHRONO_PRINT("user", &tv_diff);
+
+		timersub(&ru_now.ru_stime, &m_ru.ru_stime, &tv_diff);
+		CHRONO_PRINT("sys ", &tv_diff);
+	}
+
+	/** Cause the timings not to be printed from the destructor. */
+	void end()
+	{
+		m_show_from_destructor = false;
+	}
+
+	/** Destructor. Calls show() unless end() was called before. */
+	~ut_chrono_t()
+	{
+		if (m_show_from_destructor) {
+			show();
+		}
+	}
+
+private:
+	/** Name of this chronometer. */
+	const char*	m_name;
+
+	/** True if the current timings should be printed by the destructor. */
+	bool		m_show_from_destructor;
+
+	/** getrusage() result as of the last reset(). */
+	struct rusage	m_ru;
+
+	/** gettimeofday() result as of the last reset(). */
+	struct timeval	m_tv;
+};
+
+#endif /* HAVE_SYS_TIME_H && HAVE_SYS_RESOURCE_H */
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif
diff --git a/storage/innobase/include/ut0list.h b/storage/innobase/include/ut0list.h
new file mode 100644
index 00000000..765f6a2a
--- /dev/null
+++ b/storage/innobase/include/ut0list.h
@@ -0,0 +1,146 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0list.h
+A double-linked list
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/*******************************************************************//**
+A double-linked list. This differs from the one in ut0lst.h in that in this
+one, each list node contains a pointer to the data, whereas the one in
+ut0lst.h uses a strategy where the list pointers are embedded in the data
+items themselves.
+
+Use this one when you need to store arbitrary data in the list where you
+can't embed the list pointers in the data, if a data item needs to be
+stored in multiple lists, etc.
+
+Note about the memory management: ib_list_t is a fixed-size struct whose
+allocation/deallocation is done through ib_list_create/ib_list_free, but the
+memory for the list nodes is allocated through a user-given memory heap,
+which can either be the same for all nodes or vary per node. Most users will
+probably want to create a memory heap to store the item-specific data, and
+pass in this same heap to the list node creation functions, thus
+automatically freeing the list node when the item's heap is freed.
+
+************************************************************************/
+
+#ifndef IB_LIST_H
+#define IB_LIST_H
+
+#include "mem0mem.h"
+
+struct ib_list_t;
+struct ib_list_node_t;
+
+/****************************************************************//**
+Create a new list using mem_alloc. Lists created with this function must be
+freed with ib_list_free.
+@return list */
+ib_list_t*
+ib_list_create(void);
+/*=================*/
+
+/****************************************************************//**
+Free a list. */
+void
+ib_list_free(
+/*=========*/
+	ib_list_t*	list);	/*!< in: list */
+
+/****************************************************************//**
+Add the data to the end of the list.
+@return new list node */
+ib_list_node_t*
+ib_list_add_last(
+/*=============*/
+	ib_list_t*	list,	/*!< in: list */
+	void*		data,	/*!< in: data */
+	mem_heap_t*	heap);	/*!< in: memory heap to use */
+
+/****************************************************************//**
+Remove the node from the list. */
+void
+ib_list_remove(
+/*===========*/
+	ib_list_t*	list,	/*!< in: list */
+	ib_list_node_t*	node);	/*!< in: node to remove */
+
+/****************************************************************//**
+Get the first node in the list.
+@return first node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_first(
+/*==============*/
+	ib_list_t*	list);	/*!< in: list */
+
+/****************************************************************//**
+Get the last node in the list.
+@return last node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_last(
+/*=============*/
+	ib_list_t*	list);	/*!< in: list */
+
+/********************************************************************
+Check if list is empty. */
+UNIV_INLINE
+ibool
+ib_list_is_empty(
+/*=============*/
+					/* out: TRUE if empty else FALSE */
+	const ib_list_t*	list);	/* in: list */
+
+/********************************************************************
+Get number of items on list.
+@return number of items on list */
+UNIV_INLINE
+ulint
+ib_list_len(
+/*========*/
+	const ib_list_t*	list);		/*!< in: list */
+
+/* List. Each node points to its data instead of being embedded in it. */
+struct ib_list_t {
+	ib_list_node_t*		first;		/*!< first node, or NULL */
+	ib_list_node_t*		last;		/*!< last node, or NULL */
+};
+
+/* A list node. It refers to the user data; the data is not embedded. */
+struct ib_list_node_t {
+	ib_list_node_t*		prev;		/*!< previous node */
+	ib_list_node_t*		next;		/*!< next node, or NULL */
+	void*			data;		/*!< user data */
+};
+
+/* Quite often, the only additional piece of data you need is the per-item
+memory heap, so we have this generic struct available to use in those
+cases. */
+struct ib_list_helper_t {
+	mem_heap_t*	heap;		/*!< memory heap */
+	void*		data;		/*!< user data */
+};
+
+#include "ut0list.inl"
+
+#endif
diff --git a/storage/innobase/include/ut0list.inl b/storage/innobase/include/ut0list.inl
new file mode 100644
index 00000000..3bdba52b
--- /dev/null
+++ b/storage/innobase/include/ut0list.inl
@@ -0,0 +1,80 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0list.inl
+A double-linked list
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/****************************************************************//**
+Return the node stored in the list's first pointer.
+@return first node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_first(
+/*==============*/
+	ib_list_t*	list)	/*!< in: list */
+{
+	return list->first;
+}
+
+/****************************************************************//**
+Return the node stored in the list's last pointer.
+@return last node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_last(
+/*=============*/
+	ib_list_t*	list)	/*!< in: list */
+{
+	return list->last;
+}
+
+/********************************************************************
+Tell whether the list has no nodes. */
+UNIV_INLINE
+ibool
+ib_list_is_empty(
+/*=============*/
+					/* out: TRUE if first and last are NULL */
+	const ib_list_t*	list)	/* in: list */
+{
+	return(!list->first && !list->last);
+}
+
+/********************************************************************
+Count the nodes of the list by walking it from the first node.
+@return number of items on list */
+UNIV_INLINE
+ulint
+ib_list_len(
+/*========*/
+	const ib_list_t*	list)		/*!< in: list */
+{
+	ulint n_nodes = 0;
+
+	/* Every node that is reachable through the next pointers
+	counts once; the walk ends at the first NULL pointer. */
+	for (ib_list_node_t* cur = list->first; cur; cur = cur->next) {
+		n_nodes++;
+	}
+
+	return (n_nodes);
+}
diff --git a/storage/innobase/include/ut0lst.h b/storage/innobase/include/ut0lst.h
new file mode 100644
index 00000000..7b7ed7b8
--- /dev/null
+++ b/storage/innobase/include/ut0lst.h
@@ -0,0 +1,563 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0lst.h
+List utilities
+
+Created 9/10/1995 Heikki Tuuri
+Rewritten by Sunny Bains Dec 2011.
+***********************************************************************/
+
+#pragma once
+
+/* Do not include univ.i because univ.i includes this. */
+
+#include "ut0dbg.h"
+
+/* This module implements the two-way linear list. Note that a single
+list node may belong to two or more lists, but is only on one list
+at a time. */
+
+/*******************************************************************//**
+The two way list node, a member of every element that is on a list.
+@param Type the type of the list element */
+template <typename Type>
+struct ut_list_node {
+	Type*		prev;	/*!< previous element, NULL if start of list */
+	Type*		next;	/*!< next element, NULL if end of list */
+
+	/** Exchange the prev and next pointers, so that this node is
+	linked in the opposite direction. */
+	void reverse()
+	{
+		Type*	old_next = next;
+		next = prev;
+		prev = old_next;
+	}
+};
+
+/** Macro used for legacy reasons */
+#define UT_LIST_NODE_T(t)		ut_list_node<t>
+
+/*******************************************************************//**
+The two-way list base node. The base node contains pointers to both ends
+of the list and a count of nodes in the list (excluding the base node
+from the count). We also store a pointer to the member field so that it
+doesn't have to be specified when doing list operations.
+@param Type the type of the list element
+@param NodePtr field member pointer that points to the list node */
+template <typename Type, typename NodePtr>
+struct ut_list_base {
+	typedef Type elem_type;
+	typedef NodePtr node_ptr;
+	typedef ut_list_node<Type> node_type;
+
+	ulint		count;			/*!< number of elements */
+	elem_type*	start;			/*!< first element,
+						NULL if empty */
+	elem_type*	end;			/*!< last element,
+						NULL if empty */
+	node_ptr	node;			/*!< pointer to the member
+						that is used as a link node */
+#ifdef UNIV_DEBUG
+	ulint		init;			/*!< UT_LIST_INITIALISED once
+						UT_LIST_INIT() has been run */
+#endif /* UNIV_DEBUG */
+
+	/** Exchange the start and end pointers. */
+	void reverse()
+	{
+		Type*	old_end = end;
+		end = start;
+		start = old_end;
+	}
+};
+
+#define UT_LIST_BASE_NODE_T(t)	ut_list_base<t, ut_list_node<t> t::*>
+
+#ifdef UNIV_DEBUG
+# define UT_LIST_INITIALISED		0xCAFE
+# define UT_LIST_INITIALISE(b)		(b).init = UT_LIST_INITIALISED
+# define UT_LIST_IS_INITIALISED(b)	ut_a(((b).init == UT_LIST_INITIALISED))
+#else
+# define UT_LIST_INITIALISE(b)
+# define UT_LIST_IS_INITIALISED(b)
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Note: This is really the list constructor. We should be able to use
+placement new here.
+Initializes the base node of a two-way list.
+@param b the list base node
+@param pmf pointer to the member field that will be used as the link node */
+#define UT_LIST_INIT(b, pmf)						\
+{									\
+	(b).count = 0;							\
+	(b).start = 0;							\
+	(b).end   = 0;							\
+	(b).node  = pmf;						\
+	UT_LIST_INITIALISE(b);						\
+}
+
+/** Functor for accessing the embedded node within a list element. This is
+required because some lists can have the node embedded inside a nested
+struct/union. See lock0priv.h (table locks) for an example. It provides a
+specialised functor to grant access to the list node. */
+template <typename Type>
+struct GenericGetNode {
+
+	typedef ut_list_node<Type> node_type;
+
+	GenericGetNode(node_type Type::* node) : m_node(node) {}
+
+	node_type& operator() (Type& elem)
+	{
+		return(elem.*m_node);
+	}
+
+	node_type	Type::*m_node;	/*!< member pointer to the link node */
+};
+
+/*******************************************************************//**
+Links an element in as the new first element of a two-way linked list.
+@param list the base node (not a pointer to it)
+@param elem the element to add */
+template <typename List>
+void
+ut_list_prepend(
+	List&				list,
+	typename List::elem_type*	elem)
+{
+	typename List::node_type&	link = elem->*list.node;
+
+	UT_LIST_IS_INITIALISED(list);
+
+	typename List::elem_type* const	old_head = list.start;
+
+	link.prev = 0;
+	link.next = old_head;
+
+	if (old_head != 0) {
+		/* Hook the previous head up behind the new element. */
+		ut_ad(old_head != elem);
+		(old_head->*list.node).prev = elem;
+	}
+
+	list.start = elem;
+
+	/* An empty list has no tail yet; the new element is both ends. */
+	if (list.end == 0) {
+		list.end = elem;
+	}
+
+	++list.count;
+}
+
+/*******************************************************************//**
+Adds the node as the first element in a two-way linked list.
+@param LIST the base node (not a pointer to it)
+@param ELEM the element to add */
+#define UT_LIST_ADD_FIRST(LIST, ELEM)	ut_list_prepend(LIST, ELEM)
+
+/*******************************************************************//**
+Links an element in as the new last element of a two-way linked list.
+@param list list
+@param elem the element to add
+@param get_node to get the list node for that element */
+template <typename List, typename Functor>
+void
+ut_list_append(
+	List&				list,
+	typename List::elem_type*	elem,
+	Functor				get_node)
+{
+	typename List::node_type&	link = get_node(*elem);
+
+	UT_LIST_IS_INITIALISED(list);
+	typename List::elem_type* const	old_tail = list.end;
+
+	link.next = 0;
+	link.prev = old_tail;
+
+	if (old_tail != 0) {
+		/* Hook the previous tail up in front of the new element. */
+		ut_ad(old_tail != elem);
+		get_node(*old_tail).next = elem;
+	}
+
+	list.end = elem;
+
+	/* An empty list has no head yet; the new element is both ends. */
+	if (list.start == 0) {
+		list.start = elem;
+	}
+
+	++list.count;
+}
+
+/*******************************************************************//**
+Appends an element, locating its link node through list.node.
+@param list list
+@param elem the element to add */
+template <typename List>
+void
+ut_list_append(
+	List&				list,
+	typename List::elem_type*	elem)
+{
+	typedef GenericGetNode<typename List::elem_type>	get_node_t;
+
+	ut_list_append(list, elem, get_node_t(list.node));
+}
+
+/*******************************************************************//**
+Adds the node as the last element in a two-way linked list.
+@param LIST list base node (not a pointer to it)
+@param ELEM the element to add */
+#define UT_LIST_ADD_LAST(LIST, ELEM)	ut_list_append(LIST, ELEM)
+
+/*******************************************************************//**
+Inserts elem2 directly after elem1 in a list.
+@param list the base node
+@param elem1 node after which elem2 is inserted
+@param elem2 node being inserted after elem1 */
+template <typename List>
+void
+ut_list_insert(
+	List&				list,
+	typename List::elem_type*	elem1,
+	typename List::elem_type*	elem2)
+{
+	ut_ad(elem1 != elem2);
+	UT_LIST_IS_INITIALISED(list);
+
+	typename List::node_type&	before = elem1->*list.node;
+	typename List::node_type&	added = elem2->*list.node;
+	/* Remember the old successor before any link is rewritten. */
+	typename List::elem_type* const	after = before.next;
+
+	added.prev = elem1;
+	added.next = after;
+
+	if (after != NULL) {
+		(after->*list.node).prev = elem2;
+	}
+
+	before.next = elem2;
+
+	/* elem2 becomes the tail when it was added after the old tail. */
+	if (list.end == elem1) {
+		list.end = elem2;
+	}
+
+	++list.count;
+}
+
+/*******************************************************************//**
+Inserts ELEM2 after ELEM1 in a list.
+@param LIST list base node (not a pointer to it)
+@param ELEM1 node after which ELEM2 is inserted
+@param ELEM2 node being inserted after ELEM1 */
+#define UT_LIST_INSERT_AFTER(LIST, ELEM1, ELEM2)			\
+	ut_list_insert(LIST, ELEM1, ELEM2)
+
+/*******************************************************************//**
+Inserts elem2 directly after elem1 in a list.
+@param list the base node
+@param elem1 node after which elem2 is inserted
+@param elem2 node being inserted after elem1
+@param get_node to get the list node for that element */
+
+template <typename List, typename Functor>
+void
+ut_list_insert(
+	List&				list,
+	typename List::elem_type*	elem1,
+	typename List::elem_type*	elem2,
+	Functor				get_node)
+{
+	ut_ad(elem1 != elem2);
+	UT_LIST_IS_INITIALISED(list);
+
+	typename List::node_type&	before = get_node(*elem1);
+	typename List::node_type&	added = get_node(*elem2);
+	/* Remember the old successor before any link is rewritten. */
+	typename List::elem_type* const	after = before.next;
+
+	added.prev = elem1;
+	added.next = after;
+
+	/* The old successor, if there is one, now follows elem2. */
+	if (after != NULL) {
+		get_node(*after).prev = elem2;
+	}
+
+	before.next = elem2;
+
+	/* elem2 becomes the tail when it was added after the old tail. */
+	if (list.end == elem1) {
+		list.end = elem2;
+	}
+
+	++list.count;
+}
+/*******************************************************************//**
+Unlinks a node from a two-way linked list and clears its pointers.
+@param list the base node (not a pointer to it)
+@param node member node within list element that is to be removed
+@param get_node functor to get the list node from elem */
+template <typename List, typename Functor>
+void
+ut_list_remove(
+	List&				list,
+	typename List::node_type&	node,
+	Functor				get_node)
+{
+	ut_a(list.count > 0);
+	UT_LIST_IS_INITIALISED(list);
+
+	typename List::elem_type* const	after = node.next;
+	typename List::elem_type* const	before = node.prev;
+
+	/* Bridge the gap from the successor's side, or move the tail. */
+	if (after == NULL) {
+		list.end = before;
+	} else {
+		get_node(*after).prev = before;
+	}
+
+	/* Bridge the gap from the predecessor's side, or move the head. */
+	if (before == NULL) {
+		list.start = after;
+	} else {
+		get_node(*before).next = after;
+	}
+
+	/* Leave the removed node fully detached. */
+	node.next = 0;
+	node.prev = 0;
+
+	--list.count;
+}
+
+/** Removes an element from a two-way linked list.
+@param list the base node (not a pointer to it)
+@param elem element to be removed from the list
+@param get_node functor to get the list node from elem */
+template <typename List, typename Functor>
+void
+ut_list_remove(
+	List&				list,
+	typename List::elem_type*	elem,
+	Functor				get_node)
+{
+	typename List::node_type&	link = get_node(*elem);
+	ut_list_remove(list, link, get_node);
+}
+
+/*******************************************************************//**
+Removes an element, locating its link node through list.node.
+@param list the base node (not a pointer to it)
+@param elem element to be removed from the list */
+template <typename List>
+void
+ut_list_remove(
+	List&				list,
+	typename List::elem_type*	elem)
+{
+	typedef GenericGetNode<typename List::elem_type>	get_node_t;
+
+	ut_list_remove(list, elem->*list.node, get_node_t(list.node));
+}
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param LIST the base node (not a pointer to it)
+@param ELEM node to be removed from the list */
+#define UT_LIST_REMOVE(LIST, ELEM)	ut_list_remove(LIST, ELEM)
+
+/********************************************************************//**
+Gets the next node in a two-way list.
+@param NAME list name
+@param N pointer to a node
+@return the successor of N in NAME, or NULL */
+#define UT_LIST_GET_NEXT(NAME, N)	(((N)->NAME).next)
+
+/********************************************************************//**
+Gets the previous node in a two-way list.
+@param NAME list name
+@param N pointer to a node
+@return the predecessor of N in NAME, or NULL */
+#define UT_LIST_GET_PREV(NAME, N)	(((N)->NAME).prev)
+
+/********************************************************************//**
+Alternative macro to get the number of nodes in a two-way list, i.e.,
+its length.
+@param BASE the base node (not a pointer to it).
+@return the number of nodes in the list */
+#define UT_LIST_GET_LEN(BASE)		(BASE).count
+
+/********************************************************************//**
+Gets the first node in a two-way list.
+@param BASE the base node (not a pointer to it)
+@return first node, or NULL if the list is empty */
+#define UT_LIST_GET_FIRST(BASE)		(BASE).start
+
+/********************************************************************//**
+Gets the last node in a two-way list.
+@param BASE the base node (not a pointer to it)
+@return last node, or NULL if the list is empty */
+#define UT_LIST_GET_LAST(BASE)		(BASE).end
+
+struct NullValidate { void operator()(const void*) const {} };
+
+/** Call the functor for each element, from list start to list end.
+@param[in]	list	base node (not a pointer to it)
+@param[in,out]	functor	Functor that is called for each element in the list */
+template <typename List, class Functor>
+inline void ut_list_map(const List& list, Functor& functor)
+{
+	ulint	n_visited = 0;
+
+	UT_LIST_IS_INITIALISED(list);
+	typename List::elem_type*	cur = list.start;
+	while (cur != NULL) {
+		functor(cur);
+		cur = (cur->*list.node).next;
+		++n_visited;
+	}
+
+	ut_a(n_visited == list.count);
+}
+
+/** Call the functor for each element, from list start to list end.
+@param[in]	list	base node (not a pointer to it)
+@param[in]	functor	Functor that is called for each element in the list */
+template <typename List, class Functor>
+inline void ut_list_map(const List& list, const Functor& functor)
+{
+	ulint	n_visited = 0;
+
+	UT_LIST_IS_INITIALISED(list);
+	typename List::elem_type*	cur = list.start;
+	while (cur != NULL) {
+		functor(cur);
+		cur = (cur->*list.node).next;
+		++n_visited;
+	}
+
+	ut_a(n_visited == list.count);
+}
+
+/** Check the forward and (with UNIV_DEBUG) backward links of a list.
+@param[in] list		base node (not a pointer to it)
+@param[in,out] functor	Functor that is called for each element in the list */
+template <typename List, class Functor>
+void ut_list_validate(const List& list, Functor& functor)
+{
+	ut_list_map(list, functor);
+#ifdef UNIV_DEBUG
+	/* Walk backwards: prev links must cover list.count elements. */
+	ulint				n_left = list.count;
+	typename List::elem_type*	cur = list.end;
+
+	while (cur != 0) {
+		--n_left;
+		cur = (cur->*list.node).prev;
+	}
+	ut_ad(!n_left);
+#endif
+}
+
+/** Check the forward and (with UNIV_DEBUG) backward links of a list.
+@param[in] list		base node (not a pointer to it)
+@param[in] functor	Functor that is called for each element in the list */
+template <typename List, class Functor>
+inline void ut_list_validate(const List& list, const Functor& functor)
+{
+	ut_list_map(list, functor);
+#ifdef UNIV_DEBUG
+	/* Walk from the end through the prev pointers; the walk must
+	use up exactly list.count elements. */
+	ulint				n_left = list.count;
+	typename List::elem_type*	cur = list.end;
+
+	while (cur != 0) {
+		--n_left;
+		cur = (cur->*list.node).prev;
+	}
+	ut_ad(!n_left);
+#endif
+}
+
+/** Validate the list links only; a no-op when ut_d() is compiled out. */
+template <typename List> inline void ut_list_validate(const List& list)
+{
+  ut_d(ut_list_validate(list, NullValidate()));
+}
+
+#ifdef UNIV_DEBUG
+/** Reverse the order of the elements in the list. */
+template <typename List>
+inline void ut_list_reverse(List& list)
+{
+	UT_LIST_IS_INITIALISED(list);
+	typename List::elem_type*	cur = list.start;
+	while (cur != 0) {
+		typename List::node_type&	link = cur->*list.node;
+		link.reverse();
+		cur = link.prev; /* after reverse() this is the old next */
+	}
+	list.reverse();
+}
+
+/** Check if the given element is linked into the list.
+@param[in]	list	the list object
+@param[in]	elem	the element to look for
+@return whether elem was found by walking the list from its start */
+template <typename List>
+inline bool ut_list_exists(const List& list, typename List::elem_type* elem)
+{
+	typename List::elem_type*	cur = UT_LIST_GET_FIRST(list);
+
+	while (cur != NULL && cur != elem) {
+		cur = (cur->*list.node).next;
+	}
+	return cur != NULL;
+}
+#endif
+
+/** Move the given element to the beginning of the list.
+@param[in,out]	list	the list object
+@param[in]	elem	the element of the list which will be moved */
+template <typename List>
+void
+ut_list_move_to_front(
+	List&				list,
+	typename List::elem_type*	elem)
+{
+	ut_ad(ut_list_exists(list, elem));
+
+	if (list.start == elem) {
+		return;	/* already the first element */
+	}
+	ut_list_remove(list, elem);
+	ut_list_prepend(list, elem);
+}
diff --git a/storage/innobase/include/ut0mem.h b/storage/innobase/include/ut0mem.h
new file mode 100644
index 00000000..a5ed72f9
--- /dev/null
+++ b/storage/innobase/include/ut0mem.h
@@ -0,0 +1,76 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0mem.h
+Memory primitives
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#ifndef ut0mem_h
+#define ut0mem_h
+
+#include "univ.i"
+
+/********************************************************************
+Concatenate 3 strings.*/
+char*
+ut_str3cat(
+/*=======*/
+				/* out, own: concatenated string, must be
+				freed with ut_free() */
+	const char*	s1,	/* in: string 1 */
+	const char*	s2,	/* in: string 2 */
+	const char*	s3);	/* in: string 3 */
+
+/**********************************************************************//**
+Converts a raw binary data to a NUL-terminated hex string. The output is
+truncated if there is not enough space in "hex", make sure "hex_size" is at
+least (2 * raw_size + 1) if you do not want this to happen. Returns the
+actual number of characters written to "hex" (including the NUL).
+@return number of chars written */
+UNIV_INLINE
+ulint
+ut_raw_to_hex(
+/*==========*/
+	const void*	raw,		/*!< in: raw data */
+	ulint		raw_size,	/*!< in: "raw" length in bytes */
+	char*		hex,		/*!< out: hex string */
+	ulint		hex_size);	/*!< in: "hex" size in bytes */
+
+/*******************************************************************//**
+Adds single quotes to the start and end of string and escapes any quotes
+by doubling them. Returns the number of bytes that were written to "buf"
+(including the terminating NUL). If buf_size is too small then the
+trailing bytes from "str" are discarded.
+@return number of bytes that were written */
+UNIV_INLINE
+ulint
+ut_str_sql_format(
+/*==============*/
+	const char*	str,		/*!< in: string */
+	ulint		str_len,	/*!< in: string length in bytes */
+	char*		buf,		/*!< out: output buffer */
+	ulint		buf_size);	/*!< in: output buffer size
+					in bytes */
+
+#include "ut0mem.inl"
+
+#endif
diff --git a/storage/innobase/include/ut0mem.inl b/storage/innobase/include/ut0mem.inl
new file mode 100644
index 00000000..cc95a036
--- /dev/null
+++ b/storage/innobase/include/ut0mem.inl
@@ -0,0 +1,246 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0mem.inl
+Memory primitives
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#include "ut0byte.h"
+#include "mach0data.h"
+
+/**********************************************************************//**
+Converts raw binary data to a NUL-terminated, upper-case hex string. The
+output is truncated if there is not enough space in "hex"; make sure
+"hex_size" is at least (2 * raw_size + 1) if you do not want this to happen.
+Returns the number of bytes written to "hex" (including the NUL).
+@return number of chars written, 0 if hex_size is 0 */
+UNIV_INLINE
+ulint
+ut_raw_to_hex(
+/*==========*/
+	const void*	raw,		/*!< in: raw data */
+	ulint		raw_size,	/*!< in: "raw" length in bytes */
+	char*		hex,		/*!< out: hex string */
+	ulint		hex_size)	/*!< in: "hex" size in bytes */
+{
+	/* Each byte maps to a uint16 whose in-memory bytes are its two digits. */
+#ifdef WORDS_BIGENDIAN
+
+#define MK_UINT16(a, b) (((uint16) (a)) << 8 | (uint16) (b))
+
+#define UINT16_GET_A(u)	((char) ((u) >> 8))
+#define UINT16_GET_B(u)	((char) ((u) & 0xFF))
+
+#else /* WORDS_BIGENDIAN */
+
+#define MK_UINT16(a, b) (((uint16) (b)) << 8 | (uint16) (a))
+
+#define UINT16_GET_A(u)	((char) ((u) & 0xFF))
+#define UINT16_GET_B(u)	((char) ((u) >> 8))
+
+#endif /* WORDS_BIGENDIAN */
+/* One table row: first digit a combined with each possible second digit. */
+#define MK_ALL_UINT16_WITH_A(a)	\
+	MK_UINT16(a, '0'),	\
+	MK_UINT16(a, '1'),	\
+	MK_UINT16(a, '2'),	\
+	MK_UINT16(a, '3'),	\
+	MK_UINT16(a, '4'),	\
+	MK_UINT16(a, '5'),	\
+	MK_UINT16(a, '6'),	\
+	MK_UINT16(a, '7'),	\
+	MK_UINT16(a, '8'),	\
+	MK_UINT16(a, '9'),	\
+	MK_UINT16(a, 'A'),	\
+	MK_UINT16(a, 'B'),	\
+	MK_UINT16(a, 'C'),	\
+	MK_UINT16(a, 'D'),	\
+	MK_UINT16(a, 'E'),	\
+	MK_UINT16(a, 'F')
+	/* hex_map[b] encodes byte b: rows = high nibble, columns = low nibble. */
+	static const uint16	hex_map[256] = {
+		MK_ALL_UINT16_WITH_A('0'),
+		MK_ALL_UINT16_WITH_A('1'),
+		MK_ALL_UINT16_WITH_A('2'),
+		MK_ALL_UINT16_WITH_A('3'),
+		MK_ALL_UINT16_WITH_A('4'),
+		MK_ALL_UINT16_WITH_A('5'),
+		MK_ALL_UINT16_WITH_A('6'),
+		MK_ALL_UINT16_WITH_A('7'),
+		MK_ALL_UINT16_WITH_A('8'),
+		MK_ALL_UINT16_WITH_A('9'),
+		MK_ALL_UINT16_WITH_A('A'),
+		MK_ALL_UINT16_WITH_A('B'),
+		MK_ALL_UINT16_WITH_A('C'),
+		MK_ALL_UINT16_WITH_A('D'),
+		MK_ALL_UINT16_WITH_A('E'),
+		MK_ALL_UINT16_WITH_A('F')
+	};
+	const unsigned char*	rawc;
+	ulint			read_bytes;
+	ulint			write_bytes;
+	ulint			i;
+
+	rawc = (const unsigned char*) raw;
+
+	if (hex_size == 0) {
+		/* No room even for the terminating NUL. */
+		return(0);
+	}
+	/* Choose how many input bytes to convert and the resulting length. */
+	if (hex_size <= 2 * raw_size) {
+		/* Truncation: fill the buffer, its last byte becomes the NUL. */
+		read_bytes = hex_size / 2;
+		write_bytes = hex_size;
+	} else {
+		/* Everything fits: two digits per byte plus the NUL. */
+		read_bytes = raw_size;
+		write_bytes = 2 * raw_size + 1;
+	}
+
+#define LOOP_READ_BYTES(ASSIGN)			\
+	for (i = 0; i < read_bytes; i++) {	\
+		ASSIGN;				\
+		hex += 2;			\
+		rawc++;				\
+	}
+	/* NOTE(review): presumably tests that "hex" is 2-byte aligned - confirm. */
+	if (ut_align_offset(hex, 2) == 0) {
+		/* Store both digits with a single uint16 write. */
+		LOOP_READ_BYTES(
+			*(uint16*) hex = hex_map[*rawc]
+		);
+	} else {
+		/* Store the two digits one byte at a time. */
+		LOOP_READ_BYTES(
+			*hex       = UINT16_GET_A(hex_map[*rawc]);
+			*(hex + 1) = UINT16_GET_B(hex_map[*rawc])
+		);
+	}
+	/* An even-sized truncating buffer is now completely full of digits: */
+	if (hex_size <= 2 * raw_size && hex_size % 2 == 0) {
+		/* step back so that the NUL replaces the last digit. */
+		hex--;
+	}
+
+	*hex = '\0';
+	/* NOTE(review): the helper macros defined above are never #undef'd. */
+	return(write_bytes);
+}
+
+/*******************************************************************//**
+Wraps "str" in single quotes, doubling any single quote or backslash in it
+and writing an embedded NUL byte as backslash + '0'. Returns the number of
+bytes that were written to "buf" (including the terminating NUL). If
+buf_size is too small then the trailing bytes from "str" are discarded.
+@return number of bytes that were written */
+UNIV_INLINE
+ulint
+ut_str_sql_format(
+/*==============*/
+	const char*	str,		/*!< in: string */
+	ulint		str_len,	/*!< in: string length in bytes */
+	char*		buf,		/*!< out: output buffer */
+	ulint		buf_size)	/*!< in: output buffer size
+					in bytes */
+{
+	ulint	out_len = 0;	/* bytes written to "buf" so far */
+
+	if (buf_size == 0) {
+
+		/* Not even the terminating NUL fits. */
+		return(0);
+	}
+
+	if (buf_size < 4) {
+
+		/* Too small for quote + character + quote + NUL. The
+		only quoted result that can fit is the empty string,
+		which takes exactly 3 bytes; in every other case just
+		an empty C string is produced. */
+		if (buf_size == 3 && str_len == 0) {
+
+			buf[out_len++] = '\'';
+			buf[out_len++] = '\'';
+		}
+
+		buf[out_len++] = '\0';
+
+		return(out_len);
+	}
+
+	/* buf_size >= 4. Throughout the loop at least 2 bytes are kept
+	free for the closing quote and the terminating NUL, so the two
+	stores after the loop can never overflow "buf". */
+
+	buf[out_len++] = '\'';
+
+	for (ulint in_pos = 0; in_pos < str_len; in_pos++) {
+
+		const ulint	room = buf_size - out_len;
+
+		if (room == 2) {
+
+			/* Only the closing quote and the NUL fit. */
+			break;
+		}
+
+		const char	ch = str[in_pos];
+		const bool	is_nul = (ch == '\0');
+
+		if (!is_nul && ch != '\'' && ch != '\\') {
+
+			/* Ordinary byte: copied verbatim. */
+			buf[out_len++] = ch;
+			continue;
+		}
+
+		/* The byte needs two output bytes: a NUL is written as
+		backslash + '0', while a single quote or a backslash is
+		simply doubled. */
+
+		if (room < 4) {
+
+			/* The pair would use up the space reserved for
+			the closing quote and the NUL: stop here and
+			discard the rest of "str". */
+			break;
+		}
+
+		if (is_nul) {
+
+			buf[out_len++] = '\\';
+			buf[out_len++] = '0';
+		} else {
+
+			buf[out_len++] = ch;
+			buf[out_len++] = ch;
+		}
+	}
+
+	/* Close the quoted literal and terminate the C string; both
+	bytes are counted in the returned length. */
+	buf[out_len++] = '\'';
+	buf[out_len++] = '\0';
+
+	return(out_len);
+}
diff --git a/storage/innobase/include/ut0new.h b/storage/innobase/include/ut0new.h
new file mode 100644
index 00000000..f4183e4c
--- /dev/null
+++ b/storage/innobase/include/ut0new.h
@@ -0,0 +1,1099 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ut0new.h
+Instrumented memory allocator.
+
+Created May 26, 2014 Vasil Dimov
+*******************************************************/
+
+/** Dynamic memory allocation within InnoDB guidelines.
+All dynamic (heap) memory allocations (malloc(3), strdup(3), etc, "new",
+various std:: containers that allocate memory internally), that are done
+within InnoDB are instrumented. This means that InnoDB uses a custom set
+of functions for allocating memory, rather than calling e.g. "new" directly.
+
+Here follows a cheat sheet on what InnoDB functions to use whenever a
+standard one would have been used.
+
+Creating new objects with "new":
+--------------------------------
+Standard:
+  new expression
+  or
+  new(std::nothrow) expression
+InnoDB, default instrumentation:
+  UT_NEW_NOKEY(expression)
+InnoDB, custom instrumentation, preferred:
+  UT_NEW(expression, key)
+
+Destroying objects, created with "new":
+---------------------------------------
+Standard:
+  delete ptr
+InnoDB:
+  UT_DELETE(ptr)
+
+Creating new arrays with "new[]":
+---------------------------------
+Standard:
+  new type[num]
+  or
+  new(std::nothrow) type[num]
+InnoDB, default instrumentation:
+  UT_NEW_ARRAY_NOKEY(type, num)
+InnoDB, custom instrumentation, preferred:
+  UT_NEW_ARRAY(type, num, key)
+
+Destroying arrays, created with "new[]":
+----------------------------------------
+Standard:
+  delete[] ptr
+InnoDB:
+  UT_DELETE_ARRAY(ptr)
+
+Declaring a type with a std:: container, e.g. std::vector:
+----------------------------------------------------------
+Standard:
+  std::vector<t>
+InnoDB:
+  std::vector<t, ut_allocator<t> >
+
+Declaring objects of some std:: type:
+-------------------------------------
+Standard:
+  std::vector<t> v
+InnoDB, default instrumentation:
+  std::vector<t, ut_allocator<t> > v
+InnoDB, custom instrumentation, preferred:
+  std::vector<t, ut_allocator<t> > v(ut_allocator<t>(key))
+
+Raw block allocation (as usual in C++, consider whether using "new" would
+not be more appropriate):
+-------------------------------------------------------------------------
+Standard:
+  malloc(num)
+InnoDB, default instrumentation:
+  ut_malloc_nokey(num)
+InnoDB, custom instrumentation, preferred:
+  ut_malloc(num, key)
+
+Raw block resize:
+-----------------
+Standard:
+  realloc(ptr, new_size)
+InnoDB:
+  ut_realloc(ptr, new_size)
+
+Raw block deallocation:
+-----------------------
+Standard:
+  free(ptr)
+InnoDB:
+  ut_free(ptr)
+
+Note: the expression passed to UT_NEW() or UT_NEW_NOKEY() must always end
+with (), thus:
+Standard:
+  new int
+InnoDB:
+  UT_NEW_NOKEY(int())
+*/
+
+#ifndef ut0new_h
+#define ut0new_h
+
+#include <limits> /* std::numeric_limits */
+#include <thread>
+
+#include <stddef.h>
+#include <stdlib.h> /* malloc() */
+#include <string.h> /* strlen(), strrchr(), strncmp() */
+
+#include <my_sys.h> /* my_large_free/malloc() */
+
+#include "my_global.h" /* needed for headers from mysql/psi/ */
+
+#include "mysql/psi/mysql_memory.h" /* PSI_MEMORY_CALL() */
+
+#include "mysql/psi/psi_memory.h" /* PSI_memory_key, PSI_memory_info */
+
+#include "ut0ut.h" /* ut_strcmp_functor */
+
+#define	OUT_OF_MEMORY_MSG \
+	"Check if you should increase the swap file or ulimits of your" \
+	" operating system. Note that on most 32-bit computers the process" \
+	" memory space is limited to 2 GB or 4 GB."
+
+/** The total amount of memory currently allocated from the operating
+system with allocate_large() */
+extern Atomic_counter<ulint> os_total_large_mem_allocated;
+
+/** Maximum number of retries to allocate memory. */
+extern const size_t	alloc_max_retries;
+
+constexpr uint32_t INVALID_AUTOEVENT_IDX = 0xFFFFFFFFU;
+
+/** Keys for registering allocations with performance schema.
+Pointers to these variables are supplied to PFS code via the pfs_info[]
+array and the PFS code initializes them via PSI_MEMORY_CALL(register_memory)().
+mem_key_other and mem_key_std are special in the following way (see also
+ut_allocator::get_mem_key()):
+* If the caller has not provided a key and the file name of the caller is
+  unknown, then mem_key_std will be used. This happens only when called from
+  within std::* containers.
+* If the caller has not provided a key and the file name of the caller is
+  known, but is not amongst the predefined names (see ut_new_boot()) then
+  mem_key_other will be used. Generally this should not happen and if it
+  happens then that means that the list of predefined names must be extended.
+Keep this list alphabetically sorted. */
+extern PSI_memory_key	mem_key_ahi;
+extern PSI_memory_key	mem_key_buf_buf_pool;
+extern PSI_memory_key	mem_key_dict_stats_bg_recalc_pool_t;
+extern PSI_memory_key	mem_key_dict_stats_index_map_t;
+extern PSI_memory_key	mem_key_dict_stats_n_diff_on_level;
+extern PSI_memory_key	mem_key_other;
+extern PSI_memory_key	mem_key_row_log_buf;
+extern PSI_memory_key	mem_key_row_merge_sort;
+extern PSI_memory_key	mem_key_std;
+
+/** Setup the internal objects needed for UT_NEW() to operate.
+This must be called before the first call to UT_NEW(). */
+void
+ut_new_boot();
+
+#ifdef UNIV_PFS_MEMORY
+
+/**
+Retrieve a memory key (registered with PFS),
+given AUTOEVENT_IDX of the caller
+
+@param[in] autoevent_idx - AUTOEVENT_IDX value of the caller
+@return registered memory key or PSI_NOT_INSTRUMENTED */
+PSI_memory_key ut_new_get_key_by_file(uint32_t autoevent_idx);
+
+#endif /* UNIV_PFS_MEMORY */
+
+/** A structure that holds the necessary data for performance schema
+accounting. With UNIV_PFS_MEMORY, an object of this type is put in front
+of each block of memory allocated by ut_allocator::allocate(). This is
+because the data is needed even when freeing the memory. Users of
+ut_allocator::allocate_large() are responsible for maintaining this
+themselves. */
+struct ut_new_pfx_t {
+
+#ifdef UNIV_PFS_MEMORY
+
+	/** Performance schema key. Assigned to a name at startup via
+	PSI_MEMORY_CALL(register_memory)() and later used for accounting
+	allocations and deallocations with
+	PSI_MEMORY_CALL(memory_alloc)(key, size, owner) and
+	PSI_MEMORY_CALL(memory_free)(key, size, owner). */
+	PSI_memory_key	m_key;
+
+	/** Thread owner.
+	Instrumented thread that owns the allocated memory.
+	This state is used by the performance schema to maintain
+	per thread statistics,
+	when memory is given from thread A to thread B.
+	Its address is passed to memory_alloc() and its value to
+	memory_free(). */
+        struct PSI_thread *m_owner;
+
+#endif /* UNIV_PFS_MEMORY */
+
+	/** Size of the allocated block in bytes, including this prepended
+	aux structure (for ut_allocator::allocate()). For example if InnoDB
+	code requests to allocate 100 bytes, and sizeof(ut_new_pfx_t) is 16,
+	then 116 bytes are allocated in total and m_size will be 116.
+	ut_allocator::allocate_large() does not prepend this struct to the
+	allocated block and its users are responsible for maintaining it
+	and passing it later to ut_allocator::deallocate_large(). */
+	size_t		m_size;
+#if SIZEOF_VOIDP == 4
+	/** Pad the header size to a multiple of 64 bits on 32-bit systems,
+	so that the payload will be aligned to 64 bits. */
+	size_t		pad;
+#endif
+};
+
+#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP)
+/** If 'dontdump' is set, advise the OS not to core dump [ptr, ptr + m_size). */
+static inline void ut_dontdump(void *ptr, size_t m_size, bool dontdump)
+{
+	ut_a(ptr != NULL);
+	if (!dontdump || madvise(ptr, m_size, MADV_DONTDUMP) == 0) {
+		return;
+	}
+	ib::warn() << "Failed to set memory to " DONTDUMP_STR ": "
+		   << strerror(errno) << " ptr " << ptr
+		   << " size " << m_size;
+}
+
+/** Advise the OS to core dump [ptr, ptr + m_size) again; NULL is ignored. */
+static inline void ut_dodump(void* ptr, size_t m_size)
+{
+	if (ptr == NULL || madvise(ptr, m_size, MADV_DODUMP) == 0) {
+		return;
+	}
+	ib::warn() << "Failed to set memory to " DODUMP_STR ": "
+		   << strerror(errno) << " ptr " << ptr << " size " << m_size;
+}
+#else
+static inline void ut_dontdump(void *, size_t, bool) {}
+static inline void ut_dodump(void*, size_t) {}
+#endif
+
+/** Allocator class for allocating memory from inside std::* containers.
+@tparam	T		type of allocated object
+@tparam oom_fatal	whether to commit suicide when running out of memory */
+template <class T, bool oom_fatal = true>
+class ut_allocator {
+public:
+	typedef T*		pointer;
+	typedef const T*	const_pointer;
+	typedef T&		reference;
+	typedef const T&	const_reference;
+	typedef T		value_type;
+	typedef size_t		size_type;
+	typedef ptrdiff_t	difference_type;
+
+#ifdef UNIV_PFS_MEMORY
+	/** Default constructor; 'key' is what get_mem_key() prefers. */
+	explicit
+	ut_allocator(PSI_memory_key key = PSI_NOT_INSTRUMENTED)
+		: m_key(key)
+	{
+	}
+#else
+	ut_allocator() = default;
+	ut_allocator(PSI_memory_key) {} /* the key is ignored without PFS */
+#endif /* UNIV_PFS_MEMORY */
+
+	/** Converting constructor: take over the PFS key of 'other'. */
+	template <class U>
+	ut_allocator(const ut_allocator<U>&
+#ifdef UNIV_PFS_MEMORY
+		     other
+#endif
+		     )
+	{
+#ifdef UNIV_PFS_MEMORY
+		/* mem_key_std is only get_mem_key()'s fallback for "no key". */
+		m_key = other.get_mem_key();
+		if (m_key == mem_key_std) {
+			m_key = PSI_NOT_INSTRUMENTED;
+		}
+#endif /* UNIV_PFS_MEMORY */
+	}
+
+	/** Upper bound on the element count accepted by allocate(): the byte
+	size, plus the PFS header if one is prepended, must fit size_type. */
+	size_type
+	max_size() const
+	{
+#ifdef UNIV_PFS_MEMORY
+		const size_type	overhead = sizeof(ut_new_pfx_t);
+#else
+		const size_type	overhead = 0;
+#endif /* UNIV_PFS_MEMORY */
+		const size_type	limit = std::numeric_limits<size_type>::max();
+		return((limit - overhead) / sizeof(T));
+	}
+
+	pointer allocate(size_type n) { return allocate(n, NULL, INVALID_AUTOEVENT_IDX); }
+
+	/** Allocate a chunk of memory that can hold 'n_elements' objects of
+	type 'T' and trace the allocation. The 2nd (hint) argument is unused;
+	the 3rd is the caller's AUTOEVENT_IDX, used only with UNIV_PFS_MEMORY.
+	If the allocation fails and throw_on_error is set, std::bad_alloc is
+	thrown, as STL containers (e.g. std::vector) expect from an allocator.
+	After successful allocation the returned pointer must be passed
+	to ut_allocator::deallocate() when no longer needed.
+	@param[in]	n_elements	number of elements
+	@param[in]	set_to_zero	if true, then the returned memory is
+	initialized with 0x0 bytes.
+	@param[in]	throw_on_error	if true, throw instead of returning NULL
+	@return allocated memory; NULL if n_elements is 0 or on failure */
+	pointer
+	allocate(
+		size_type	n_elements,
+		const_pointer,
+		uint32_t
+#ifdef UNIV_PFS_MEMORY
+		autoevent_idx /* AUTOEVENT_IDX of the caller */
+#endif
+		,
+		bool		set_to_zero = false,
+		bool		throw_on_error = true)
+	{
+		if (n_elements == 0) {
+			return(NULL);
+		}
+
+		if (n_elements > max_size()) {
+			if (throw_on_error) {
+				throw(std::bad_alloc());
+			} else {
+				return(NULL);
+			}
+		}
+
+		void*	ptr;
+		size_t	total_bytes = n_elements * sizeof(T);
+
+#ifdef UNIV_PFS_MEMORY
+		/* The header size must not ruin the 64-bit alignment
+		on 32-bit systems. Some allocated structures use
+		64-bit fields. */
+		ut_ad((sizeof(ut_new_pfx_t) & 7) == 0);
+		total_bytes += sizeof(ut_new_pfx_t);
+#endif /* UNIV_PFS_MEMORY */
+		/* Try up to alloc_max_retries times, sleeping 1 s in between. */
+		for (size_t retries = 1; ; retries++) {
+
+			if (set_to_zero) {
+				ptr = calloc(1, total_bytes);
+			} else {
+				ptr = malloc(total_bytes);
+			}
+
+			if (ptr != NULL || retries >= alloc_max_retries) {
+				break;
+			}
+
+			std::this_thread::sleep_for(std::chrono::seconds(1));
+		}
+
+		if (ptr == NULL) {
+			ib::fatal_or_error(oom_fatal)
+				<< "Cannot allocate " << total_bytes
+				<< " bytes of memory after "
+				<< alloc_max_retries << " retries over "
+				<< alloc_max_retries << " seconds. OS error: "
+				<< strerror(errno) << " (" << errno << "). "
+				<< OUT_OF_MEMORY_MSG;
+			if (throw_on_error) {
+				throw(std::bad_alloc());
+			} else {
+				return(NULL);
+			}
+		}
+		/* With PFS the caller gets the payload that follows the header. */
+#ifdef UNIV_PFS_MEMORY
+		ut_new_pfx_t*	pfx = static_cast<ut_new_pfx_t*>(ptr);
+
+		allocate_trace(total_bytes, autoevent_idx, pfx);
+
+		return(reinterpret_cast<pointer>(pfx + 1));
+#else
+		return(reinterpret_cast<pointer>(ptr));
+#endif /* UNIV_PFS_MEMORY */
+	}
+
+	/** Release a block obtained from allocate() and trace the deallocation.
+	@param[in,out]	ptr		block to free; NULL is a no-op
+	@param[in]	n_elements	not used by this implementation */
+	void deallocate(pointer ptr, size_type n_elements = 0)
+	{
+#ifdef UNIV_PFS_MEMORY
+		if (ptr != NULL) {
+			/* The accounting header sits right in front of
+			the payload that was handed out by allocate(). */
+			ut_new_pfx_t*	header
+				= reinterpret_cast<ut_new_pfx_t*>(ptr) - 1;
+			deallocate_trace(header);
+			free(header);
+		}
+#else
+		free(ptr);
+#endif /* UNIV_PFS_MEMORY */
+	}
+
+	/** Copy-construct an object of type 'T' from 'val' in the memory
+	pointed to by 'p' (placement new, nothing is allocated).
+	@param[in,out]	p	storage for the new object
+	@param[in]	val	value to copy */
+	void
+	construct(pointer p, const T& val)
+	{
+		new(p) T(val);
+	}
+
+	/** Run the destructor of the object at 'p' without releasing its
+	memory. */
+	void
+	destroy(pointer p)
+	{
+		p->~T();
+	}
+
+	/** Return the address of an object.
+	@param[in]	x	object whose address is taken */
+	pointer
+	address(reference x) const
+	{
+		return(&x);
+	}
+
+	/** Return the address of a const object.
+	@param[in]	x	object whose address is taken */
+	const_pointer
+	address(const_reference x) const
+	{
+		return(&x);
+	}
+
+	/** Names the ut_allocator for another element type U; oom_fatal
+	takes its default value in 'other'. */
+	template <class U>
+	struct rebind { typedef ut_allocator<U> other; };
+
+	/* The following are custom methods, not required by the standard. */
+
+#ifdef UNIV_PFS_MEMORY
+
+	/** realloc(3)-like method. Failure is reported by returning NULL.
+	The passed in ptr must have been returned by allocate() and the
+	pointer returned by this method must be passed to deallocate() when
+	no longer needed. n_elements == 0 frees ptr; ptr == NULL allocates.
+	@param[in,out]	ptr		old pointer to reallocate
+	@param[in]	n_elements	new number of elements to allocate
+	@param[in]	autoevent_idx	AUTOEVENT_IDX of the caller
+	@return newly allocated memory, or NULL (see above) */
+	pointer
+	reallocate(
+		void*		ptr,
+		size_type	n_elements,
+		uint32_t	autoevent_idx)
+	{
+		if (n_elements == 0) {
+			deallocate(static_cast<pointer>(ptr));
+			return(NULL);
+		}
+
+		if (ptr == NULL) {
+			return(allocate(n_elements, NULL, autoevent_idx, false, false));
+		}
+
+		if (n_elements > max_size()) {
+			return(NULL);
+		}
+
+		ut_new_pfx_t*	pfx_old;
+		ut_new_pfx_t*	pfx_new;
+		size_t		total_bytes;
+		/* The accounting header precedes the payload ptr points to. */
+		pfx_old = reinterpret_cast<ut_new_pfx_t*>(ptr) - 1;
+
+		total_bytes = n_elements * sizeof(T) + sizeof(ut_new_pfx_t);
+		/* Same retry policy as allocate(): sleep 1 s between attempts. */
+		for (size_t retries = 1; ; retries++) {
+
+			pfx_new = static_cast<ut_new_pfx_t*>(
+				realloc(pfx_old, total_bytes));
+
+			if (pfx_new != NULL || retries >= alloc_max_retries) {
+				break;
+			}
+
+			std::this_thread::sleep_for(std::chrono::seconds(1));
+		}
+
+		if (pfx_new == NULL) {
+			ib::fatal_or_error(oom_fatal)
+				<< "Cannot reallocate " << total_bytes
+				<< " bytes of memory after "
+				<< alloc_max_retries << " retries over "
+				<< alloc_max_retries << " seconds. OS error: "
+				<< strerror(errno) << " (" << errno << "). "
+				<< OUT_OF_MEMORY_MSG;
+			return(NULL);
+		}
+
+		/* pfx_new still contains the description of the old block
+		that was presumably freed by realloc(). */
+		deallocate_trace(pfx_new);
+
+		/* pfx_new is set here to describe the new block. */
+		allocate_trace(total_bytes, autoevent_idx, pfx_new);
+
+		return(reinterpret_cast<pointer>(pfx_new + 1));
+	}
+
+	/** Allocate, trace the allocation and construct 'n_elements' objects
+	of type 'T'. If the allocation fails, this method returns NULL without
+	throwing. If a constructor throws, the objects built so far are
+	destroyed, the memory is freed and the exception is rethrown. The
+	returned pointer must be passed to delete_array() when no longer needed.
+	@param[in]	n_elements	number of elements to allocate
+	@param[in]	autoevent_idx	AUTOEVENT_IDX of the caller
+	@return pointer to the first allocated object or NULL */
+	pointer
+	new_array(
+		size_type	n_elements,
+		uint32_t autoevent_idx
+		)
+	{
+		T*	p = allocate(n_elements, NULL, autoevent_idx, false, false);
+
+		if (p == NULL) {
+			return(NULL);
+		}
+
+		T*		first = p;
+		size_type	i;
+
+		try {
+			for (i = 0; i < n_elements; i++) {
+				new(p) T;
+				++p;
+			}
+		} catch (...) {
+			for (size_type j = 0; j < i; j++) {
+				--p;
+				p->~T();
+			}
+
+			deallocate(first);
+
+			throw;
+		}
+
+		return(first);
+	}
+
+	/** Destroy, deallocate and trace the deallocation of an array created
+	by new_array(). Elements are destroyed in reverse order of construction.
+	@param[in,out]	ptr	pointer to the first object in the array */
+	void
+	delete_array(
+		T*	ptr)
+	{
+		if (ptr == NULL) {
+			return;
+		}
+
+		const size_type	n_elements = n_elements_allocated(ptr);
+
+		/* Step back from one-past-the-end, decrementing before each
+		use, so that no pointer before the array start is ever formed. */
+		for (T* p = ptr + n_elements; p != ptr; ) {
+			--p;
+			p->~T();
+		}
+
+		deallocate(ptr);
+	}
+
+#endif /* UNIV_PFS_MEMORY */
+
+	/** Allocate a large chunk of memory that can hold 'n_elements'
+	objects of type 'T' and trace the allocation.
+	@param[in]	n_elements	number of elements
+	@param[in]	dontdump	if true, advise the OS not to core
+	dump this memory.
+	@param[out]	pfx		storage for the description of the
+	allocated memory. The caller must provide space for this one and keep
+	it until the memory is no longer needed and then pass it to
+	deallocate_large(). If NULL, the size is not recorded anywhere.
+	@return pointer to the allocated memory or NULL */
+	pointer
+	allocate_large(
+		size_type	n_elements,
+		ut_new_pfx_t*	pfx,
+		bool		dontdump = false)
+	{
+		if (n_elements == 0 || n_elements > max_size()) {
+			return(NULL);
+		}
+		/* n_bytes is passed by pointer; presumably it may be adjusted. */
+		ulint	n_bytes = n_elements * sizeof(T);
+
+		pointer	ptr = reinterpret_cast<pointer>(
+			my_large_malloc(&n_bytes, MYF(0)));
+
+		if (ptr == NULL) {
+			return NULL;
+		}
+
+		ut_dontdump(ptr, n_bytes, dontdump);
+		/* NOTE(review): idx 0 below is "btr0btr", not INVALID_AUTOEVENT_IDX. */
+		if (pfx != NULL) {
+#ifdef UNIV_PFS_MEMORY
+			allocate_trace(n_bytes, 0, pfx);
+#endif /* UNIV_PFS_MEMORY */
+			pfx->m_size = n_bytes;
+		}
+
+		os_total_large_mem_allocated += n_bytes;
+
+		return(ptr);
+	}
+
+	/** Same as allocate_large(), additionally advising the OS not to core
+	dump the block (dontdump = true). */
+	pointer
+	allocate_large_dontdump(size_type n_elements, ut_new_pfx_t* pfx)
+	{
+		return(allocate_large(n_elements, pfx, true));
+	}
+	/** Free a memory allocated by allocate_large() and trace the
+	deallocation.
+	@param[in,out]	ptr	pointer to memory to free
+	@param[in]	pfx	descriptor of the memory, as filled in by
+	allocate_large(); must not be NULL because the size is read from it */
+	void
+	deallocate_large(
+		pointer			ptr,
+		const ut_new_pfx_t*	pfx)
+	{
+		ut_ad(pfx != NULL);
+		const size_t	size = pfx->m_size;
+#ifdef UNIV_PFS_MEMORY
+		/* Account with the key and owner recorded at allocation. */
+		deallocate_trace(pfx);
+#endif /* UNIV_PFS_MEMORY */
+		os_total_large_mem_allocated -= size;
+
+		my_large_free(ptr, size);
+	}
+
+	/** Counterpart of allocate_large_dontdump(): advise the OS to core
+	dump the block again, then release it with deallocate_large(). */
+	void
+	deallocate_large_dodump(pointer ptr, const ut_new_pfx_t* pfx)
+	{
+		ut_dodump(ptr, pfx->m_size);
+		deallocate_large(ptr, pfx);
+	}
+
+#ifdef UNIV_PFS_MEMORY
+	/** Pick the performance schema key under which to trace an allocation.
+	A key given at construction wins; otherwise the key is derived from
+	the caller's AUTOEVENT_IDX.
+	@param[in]	autoevent_idx	AUTOEVENT_IDX of the caller, or
+	INVALID_AUTOEVENT_IDX if unknown
+	@return m_key if set; else mem_key_std for an unknown caller; else
+	the key found for autoevent_idx, or mem_key_other if there is none */
+	PSI_memory_key
+	get_mem_key(
+		uint32_t autoevent_idx = INVALID_AUTOEVENT_IDX) const
+	{
+		if (m_key != PSI_NOT_INSTRUMENTED) {
+			return(m_key);
+		} else if (autoevent_idx == INVALID_AUTOEVENT_IDX) {
+			return(mem_key_std);
+		}
+
+		const PSI_memory_key	by_file
+			= ut_new_get_key_by_file(autoevent_idx);
+		return(by_file != PSI_NOT_INSTRUMENTED
+		       ? by_file : mem_key_other);
+	}
+
+private:
+
+	/** Recover the element count of an array created by new_array() from
+	the accounting header stored in front of it.
+	@param[in]	ptr	pointer returned by new_array()
+	@return number of elements of type T in the block */
+	size_type
+	n_elements_allocated(
+		const_pointer	ptr)
+	{
+		const ut_new_pfx_t&	header
+			= reinterpret_cast<const ut_new_pfx_t*>(ptr)[-1];
+		/* m_size covers header plus payload; strip the header. */
+		const size_type		payload
+			= header.m_size - sizeof(ut_new_pfx_t);
+
+		ut_ad(payload % sizeof(T) == 0);
+		return(payload / sizeof(T));
+	}
+
+	/** Trace a memory allocation.
+	After the accounting, the data needed for tracing the deallocation
+	later is written into 'pfx'.
+	The PFS event name is picked on the following criteria:
+	1. If key (!= PSI_NOT_INSTRUMENTED) has been specified when constructing
+	   this ut_allocator object, then the name associated with that key will
+	   be used (this is the recommended approach for new code)
+	2. Otherwise, if autoevent_idx is INVALID_AUTOEVENT_IDX, then the name
+	   associated with mem_key_std will be used
+	3. Otherwise, if ut_new_get_key_by_file() returns a key for
+	   autoevent_idx, that will be used (see ut_new_boot())
+	4. Otherwise, the name associated with mem_key_other will be used.
+	@param[in]	size	number of bytes that were allocated
+	@param[in]	autoevent_idx	autoevent_idx of the caller
+	@param[out]	pfx	placeholder to store the info which will be
+	needed when freeing the memory */
+	void
+	allocate_trace(
+		size_t		size,
+		const uint32_t autoevent_idx,
+		ut_new_pfx_t*	pfx)
+	{
+		const PSI_memory_key	key = get_mem_key(autoevent_idx);
+		/* The returned key and m_owner are kept for memory_free(). */
+		pfx->m_key = PSI_MEMORY_CALL(memory_alloc)(key, size, & pfx->m_owner);
+		pfx->m_size = size;
+	}
+
+	/** Trace a memory deallocation with what allocate_trace() recorded.
+	@param[in]	pfx	info for the deallocation (key, size, owner) */
+	void
+	deallocate_trace(
+		const ut_new_pfx_t*	pfx)
+	{
+		PSI_MEMORY_CALL(memory_free)(pfx->m_key, pfx->m_size, pfx->m_owner);
+	}
+
+	/** Performance schema key. */
+	PSI_memory_key	m_key;
+
+#endif /* UNIV_PFS_MEMORY */
+
+private:
+
+	/** Assignment operator, not used, thus disabled (private). */
+	template <class U>
+	void
+	operator=(
+		const ut_allocator<U>&);
+};
+
+/** Compare two allocators of the same type.
+Memory allocated by one can be freed by the other even if the pfs mem key
+is different, hence they always compare equal. @return true */
+template <typename T>
+inline
+bool
+operator==(const ut_allocator<T>&, const ut_allocator<T>&) { return(true); }
+
+/** Inequality of two allocators of the same type.
+@return the negation of operator==() */
+template <typename T>
+inline
+bool
+operator!=(const ut_allocator<T>& lhs, const ut_allocator<T>& rhs)
+{
+	const bool	interchangeable = (lhs == rhs);
+	return(!interchangeable);
+}
+
+#ifdef UNIV_PFS_MEMORY
+
+/*
+ constexpr trickery ahead.
+
+ Compute AUTOEVENT_IDX at compile time.
+ (index in the auto_event_names array, corresponding to basename of __FILE__)
+
+ The tricks are necessary to reduce the cost of lookup the
+ PSI_memory_key for auto event.
+*/
+
+/** @return the part of s after its last '/' or '\\', else last_slash */
+static constexpr const char* cexpr_basename_helper(const char* s, const char* last_slash)
+{
+  return *s == '\0'
+    ? last_slash
+    : cexpr_basename_helper(s + 1, (*s == '/' || *s == '\\') ? s + 1 : last_slash);
+}
+
+/** @return pointer into filename just past its last '/' or '\\',
+or filename itself if it contains neither */
+static constexpr const char* cexpr_basename(const char* filename)
+{ return cexpr_basename_helper(filename, filename); }
+
+static constexpr bool cexpr_strequal_ignore_dot(const char* a, const char* b)
+{ /* true iff a and b match up to their first '.' or terminating NUL */
+  return (*a == 0 || *a == '.') ? (*b == 0 || *b == '.')
+    : (*a == *b && cexpr_strequal_ignore_dot(a + 1, b + 1));
+}
+
+constexpr const char* const auto_event_names[] =
+{ /* file basenames without extension; the index is the AUTOEVENT_IDX */
+  "btr0btr",
+  "btr0buf",
+  "btr0bulk",
+  "btr0cur",
+  "btr0pcur",
+  "btr0sea",
+  "buf0buf",
+  "buf0dblwr",
+  "buf0dump",
+  "buf0lru",
+  "buf0rea",
+  "dict0dict",
+  "dict0mem",
+  "dict0stats",
+  "eval0eval",
+  "fil0crypt",
+  "fil0fil",
+  "fsp0file",
+  "fts0ast",
+  "fts0blex",
+  "fts0config",
+  "fts0file",
+  "fts0fts",
+  "fts0opt",
+  "fts0pars",
+  "fts0que",
+  "fts0sql",
+  "fts0tlex",
+  "gis0sea",
+  "ha_innodb",
+  "handler0alter",
+  "hash0hash",
+  "i_s",
+  "lexyy",
+  "lock0lock",
+  "mem0mem",
+  "os0file",
+  "pars0lex",
+  "rem0rec",
+  "row0ftsort",
+  "row0import",
+  "row0log",
+  "row0merge",
+  "row0mysql",
+  "row0sel",
+  "srv0start",
+  "trx0i_s",
+  "trx0i_s", /* NOTE(review): duplicate entry; the lookup returns the first */
+  "trx0roll",
+  "trx0rseg",
+  "trx0seg",
+  "trx0trx",
+  "trx0undo",
+  "ut0list",
+  "ut0mem",
+  "ut0new",
+  "ut0pool",
+  "ut0rbt",
+  "ut0wqueue",
+  "xtrabackup",
+  nullptr /* sentinel that ends cexpr_lookup_auto_event_name() */
+};
+
+constexpr uint32_t cexpr_lookup_auto_event_name(const char* name, uint32_t idx = 0)
+{ /* index of the first matching entry at or after idx, else INVALID_AUTOEVENT_IDX */
+  return auto_event_names[idx] == nullptr ? INVALID_AUTOEVENT_IDX
+    : cexpr_strequal_ignore_dot(name, auto_event_names[idx]) ? idx
+    : cexpr_lookup_auto_event_name(name, idx + 1);
+}
+
+/*
+ The AUTOEVENT_IDX macro.
+
+ Note that there is a static_assert that fires when the
+ basename of __FILE__ is not registered in the auto_event_names array.
+ If you run into this assert, add the basename to the array.
+
+ A weird looking lambda is used to force the evaluation at compile time.
+*/
+#define AUTOEVENT_IDX []()\
+{\
+  constexpr auto idx = cexpr_lookup_auto_event_name(cexpr_basename(__FILE__)); \
+  static_assert(idx != INVALID_AUTOEVENT_IDX, "auto_event_names contains no entry for " __FILE__);\
+  return idx; \
+}()
+
+
+/** Allocate, trace the allocation and construct an object.
+Use this macro instead of 'new' within InnoDB.
+For example: instead of
+	Foo*	f = new Foo(args);
+use:
+	Foo*	f = UT_NEW(Foo(args), mem_key_some);
+Upon failure to allocate the memory, this macro may return NULL;
+allocate() is called with throw_on_error = false. After successful
+allocation the pointer must be passed to UT_DELETE() when no longer needed.
+@param[in]	expr	any expression that could follow "new"
+@param[in]	key	performance schema memory tracing key
+@return pointer to the created object or NULL */
+#define UT_NEW(expr, key) \
+	/* Placement new will return NULL and not attempt to construct an
+	object if the passed in pointer is NULL, e.g. if allocate() has
+	failed to allocate memory and has returned NULL. */ \
+	::new(ut_allocator<byte>(key).allocate( \
+		sizeof expr, NULL, AUTOEVENT_IDX, false, false)) expr
+
+/** Allocate, trace the allocation and construct an object.
+Use this macro instead of 'new' within InnoDB and instead of UT_NEW()
+when creating a dedicated memory key is not feasible.
+For example: instead of
+	Foo*	f = new Foo(args);
+use:
+	Foo*	f = UT_NEW_NOKEY(Foo(args));
+Upon failure to allocate the memory, this macro may return NULL, exactly
+as UT_NEW() with PSI_NOT_INSTRUMENTED as key. After successful allocation
+the pointer must be passed to UT_DELETE() when no longer needed.
+@param[in]	expr	any expression that could follow "new"
+@return pointer to the created object or NULL */
+#define UT_NEW_NOKEY(expr)	UT_NEW(expr, PSI_NOT_INSTRUMENTED)
+
+/** Destroy, deallocate and trace the deallocation of an object created by
+UT_NEW() or UT_NEW_NOKEY().
+We can't instantiate ut_allocator without having the type of the object, thus
+we redirect this to a templated function. */
+#define UT_DELETE(ptr)		ut_delete(ptr)
+
+
+/** Destroy an object created by UT_NEW() or UT_NEW_NOKEY() and release
+its memory through ut_allocator<T>.
+A NULL pointer is ignored.
+@param[in,out]	ptr	pointer to the object, may be NULL */
+template <typename T>
+inline
+void
+ut_delete(
+	T*	ptr)
+{
+	if (ptr != NULL) {
+		ut_allocator<T>	alloc;
+
+		alloc.destroy(ptr);
+		alloc.deallocate(ptr);
+	}
+}
+
+/** Allocate and account 'n_elements' objects of type 'type'.
+Use this macro to allocate memory within InnoDB instead of 'new[]'.
+The returned pointer must be passed to UT_DELETE_ARRAY().
+@param[in]	type		type of objects being created
+@param[in]	n_elements	number of objects to create
+@param[in]	key		performance schema memory tracing key
+@return pointer to the first allocated object or NULL */
+#define UT_NEW_ARRAY(type, n_elements, key) \
+	ut_allocator<type>(key).new_array(n_elements, AUTOEVENT_IDX)
+
+/** Allocate and account 'n_elements' objects of type 'type'.
+Use this macro to allocate memory within InnoDB instead of 'new[]' and
+instead of UT_NEW_ARRAY() when it is not feasible to create a dedicated key.
+@param[in]	type		type of objects being created
+@param[in]	n_elements	number of objects to create
+@return pointer to the first allocated object or NULL */
+#define UT_NEW_ARRAY_NOKEY(type, n_elements) \
+	UT_NEW_ARRAY(type, n_elements, PSI_NOT_INSTRUMENTED)
+
+/** Destroy, deallocate and trace the deallocation of an array created by
+UT_NEW_ARRAY() or UT_NEW_ARRAY_NOKEY().
+We can't instantiate ut_allocator without having the type of the object, thus
+we redirect this to a templated function. */
+#define UT_DELETE_ARRAY(ptr)	ut_delete_array(ptr)
+
+/** Destroy and account objects created by UT_NEW_ARRAY() or
+UT_NEW_ARRAY_NOKEY().
+@param[in,out]	ptr	pointer to the first object in the array */
+template <typename T>
+inline
+void
+ut_delete_array(
+	T*	ptr)
+{
+	ut_allocator<T>().delete_array(ptr);
+}
+
+#define ut_malloc(n_bytes, key)		static_cast<void*>( \
+	ut_allocator<byte>(key).allocate( \
+		n_bytes, NULL, AUTOEVENT_IDX, false, false))
+
+#define ut_malloc_dontdump(n_bytes, key) static_cast<void*>( \
+	ut_allocator<byte>(key).allocate_large( \
+		n_bytes, NULL, true))
+
+#define ut_zalloc(n_bytes, key)		static_cast<void*>( \
+	ut_allocator<byte>(key).allocate( \
+		n_bytes, NULL, AUTOEVENT_IDX, true, false))
+
+#define ut_malloc_nokey(n_bytes)	static_cast<void*>( \
+	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).allocate( \
+		n_bytes, NULL, AUTOEVENT_IDX, false, false))
+
+#define ut_zalloc_nokey(n_bytes)	static_cast<void*>( \
+	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).allocate( \
+		n_bytes, NULL, AUTOEVENT_IDX, true, false))
+
+#define ut_zalloc_nokey_nofatal(n_bytes)	static_cast<void*>( \
+	ut_allocator<byte, false>(PSI_NOT_INSTRUMENTED).allocate( \
+		n_bytes, NULL, AUTOEVENT_IDX, true, false))
+
+#define ut_realloc(ptr, n_bytes)	static_cast<void*>( \
+	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).reallocate( \
+		ptr, n_bytes, AUTOEVENT_IDX))
+
+#define ut_free(ptr)	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).deallocate( \
+	reinterpret_cast<byte*>(ptr))
+
+#else /* UNIV_PFS_MEMORY */
+
+/* Fallbacks when memory tracing is disabled at compile time. */
+
+#define UT_NEW(expr, key)		::new(std::nothrow) expr
+#define UT_NEW_NOKEY(expr)		::new(std::nothrow) expr
+#define UT_DELETE(ptr)			::delete ptr
+
+#define UT_NEW_ARRAY(type, n_elements, key) \
+	::new(std::nothrow) type[n_elements]
+
+#define UT_NEW_ARRAY_NOKEY(type, n_elements) \
+	::new(std::nothrow) type[n_elements]
+
+#define UT_DELETE_ARRAY(ptr)		::delete[] ptr
+
+#define ut_malloc(n_bytes, key)		::malloc(n_bytes)
+
+#define ut_zalloc(n_bytes, key)		::calloc(1, n_bytes)
+
+#define ut_malloc_nokey(n_bytes)	::malloc(n_bytes)
+
+static inline void *ut_malloc_dontdump(size_t n_bytes, ...)
+{
+	void *ptr = my_large_malloc(&n_bytes, MYF(0));
+
+	/* A failed allocation is neither passed to ut_dontdump() nor accounted. */
+	if (ptr) {
+		ut_dontdump(ptr, n_bytes, true);
+		os_total_large_mem_allocated += n_bytes;
+	}
+	return ptr;
+}
+
+#define ut_zalloc_nokey(n_bytes)	::calloc(1, n_bytes)
+
+#define ut_zalloc_nokey_nofatal(n_bytes)	::calloc(1, n_bytes)
+
+#define ut_realloc(ptr, n_bytes)	::realloc(ptr, n_bytes)
+
+#define ut_free(ptr)			::free(ptr)
+
+#endif /* UNIV_PFS_MEMORY */
+
+static inline void ut_free_dodump(void *ptr, size_t size)
+{
+	ut_dodump(ptr, size);
+	os_total_large_mem_allocated -= size;
+	my_large_free(ptr, size);
+}
+
+#endif /* ut0new_h */
diff --git a/storage/innobase/include/ut0pool.h b/storage/innobase/include/ut0pool.h
new file mode 100644
index 00000000..aa0cfb9e
--- /dev/null
+++ b/storage/innobase/include/ut0pool.h
@@ -0,0 +1,365 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0pool.h
+Object pool.
+
+Created 2012-Feb-26 Sunny Bains
+***********************************************************************/
+
+#ifndef ut0pool_h
+#define ut0pool_h
+
+#include <vector>
+#include <queue>
+#include <functional>
+
+#include <my_global.h>
+
+/** Allocate the memory for the object in blocks. We keep the objects sorted
+on pointer so that they are closer together in case they have to be iterated
+over in a list. */
+template <typename Type, typename Factory, typename LockStrategy>
+struct Pool {
+
+	typedef Type value_type;
+
+	struct Element {
+		Pool*		m_pool;
+		value_type	m_type;
+	};
+
+	/** Constructor
+	@param size size of the memory block in bytes */
+	Pool(size_t size)
+		:
+		m_end(),
+		m_start(),
+		m_size(size),
+		m_last()
+	{
+		ut_ad(ut_is_2pow(size));
+		ut_a(size >= sizeof(Element));
+		static_assert(!(sizeof(Element) % CPU_LEVEL1_DCACHE_LINESIZE),
+			      "alignment");
+
+		m_lock_strategy.create();
+
+		ut_a(m_start == 0);
+
+		m_start = static_cast<Element*>(
+			aligned_malloc(m_size, CPU_LEVEL1_DCACHE_LINESIZE));
+		memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(
+			m_start, 0, m_size);
+
+		m_last = m_start;
+
+		m_end = &m_start[m_size / sizeof *m_start];
+
+		/* Note: Initialise only a small subset, even though we have
+		allocated all the memory. This is required only because PFS
+		(MTR) results change if we instantiate too many mutexes up
+		front. */
+
+		init(ut_min(size_t(16), size_t(m_end - m_start)));
+
+		ut_ad(m_pqueue.size() <= size_t(m_last - m_start));
+	}
+
+	/** Destructor */
+	~Pool()
+	{
+		m_lock_strategy.destroy();
+
+		for (Element* elem = m_start; elem != m_last; ++elem) {
+
+			ut_ad(elem->m_pool == this);
+			Factory::destroy(&elem->m_type);
+		}
+
+		IF_WIN(_aligned_free,free)(m_start);
+		m_end = m_last = m_start = 0;
+		m_size = 0;
+	}
+
+	/** Get an object from the pool.
+	@return a free instance or NULL if exhausted. */
+	Type*	get()
+	{
+		Element*	elem;
+
+		m_lock_strategy.enter();
+
+		if (!m_pqueue.empty()) {
+
+			elem = m_pqueue.top();
+			m_pqueue.pop();
+
+		} else if (m_last < m_end) {
+
+			/* Initialise the remaining elements. */
+			init(size_t(m_end - m_last));
+
+			ut_ad(!m_pqueue.empty());
+
+			elem = m_pqueue.top();
+			m_pqueue.pop();
+		} else {
+			elem = NULL;
+		}
+
+		m_lock_strategy.exit();
+		return elem ? &elem->m_type : NULL;
+	}
+
+	/** Return the object to the pool that it was allocated from.
+	@param ptr object to free */
+	static void mem_free(value_type* ptr)
+	{
+		Element*	elem;
+		byte*		p = reinterpret_cast<byte*>(ptr + 1);
+
+		elem = reinterpret_cast<Element*>(p - sizeof(*elem));
+
+		elem->m_pool->m_lock_strategy.enter();
+
+		elem->m_pool->putl(elem);
+
+		elem->m_pool->m_lock_strategy.exit();
+	}
+
+protected:
+	// Disable copying
+	Pool(const Pool&);
+	Pool& operator=(const Pool&);
+
+private:
+
+	/* We only need to compare on pointer address. */
+	typedef std::priority_queue<
+		Element*,
+		std::vector<Element*, ut_allocator<Element*> >,
+		std::greater<Element*> >	pqueue_t;
+
+	/** Release the object to the free pool
+	@param elem element to free */
+	void putl(Element* elem)
+	{
+		ut_ad(elem >= m_start && elem < m_last);
+		m_pqueue.push(elem);
+	}
+
+	/** Initialise the elements.
+	@param n_elems Number of elements to initialise */
+	void init(size_t n_elems)
+	{
+		ut_ad(size_t(m_end - m_last) >= n_elems);
+
+		for (size_t i = 0; i < n_elems; ++i, ++m_last) {
+
+			m_last->m_pool = this;
+			Factory::init(&m_last->m_type);
+			m_pqueue.push(m_last);
+		}
+
+		ut_ad(m_last <= m_end);
+	}
+
+private:
+	/** Pointer to one past the last element of the block */
+	Element*		m_end;
+
+	/** Pointer to the first element */
+	Element*		m_start;
+
+	/** Size of the block in bytes */
+	size_t			m_size;
+
+	/** One past the last initialised element */
+	Element*		m_last;
+
+	/** Priority queue ordered on the pointer address. */
+	pqueue_t		m_pqueue;
+
+	/** Lock strategy to use */
+	LockStrategy		m_lock_strategy;
+};
+
+template <typename Pool, typename LockStrategy>
+struct PoolManager {
+
+	typedef Pool PoolType;
+	typedef typename PoolType::value_type value_type;
+
+	PoolManager(size_t size)
+		:
+		m_size(size)
+	{
+		create();
+	}
+
+	~PoolManager()
+	{
+		destroy();
+
+		ut_a(m_pools.empty());
+	}
+
+	/** Get an element from one of the pools.
+	@return instance; never NULL, retries until one is available */
+	value_type* get()
+	{
+		size_t		index = 0;
+		size_t		delay = 1;
+		value_type*	ptr = NULL;
+
+		do {
+			m_lock_strategy.enter();
+
+			ut_ad(!m_pools.empty());
+
+			size_t	n_pools = m_pools.size();
+
+			PoolType*	pool = m_pools[index % n_pools];
+
+			m_lock_strategy.exit();
+
+			ptr = pool->get();
+
+			if (ptr == 0 && (index / n_pools) > 2) {
+
+				if (!add_pool(n_pools)) {
+
+					ib::error() << "Failed to allocate"
+						" memory for a pool of size "
+						<< m_size << " bytes. Will"
+						" wait for " << delay
+						<< " seconds for a thread to"
+						" free a resource";
+
+					/* There is nothing much we can do
+					except crash and burn, however lets
+					be a little optimistic and wait for
+					a resource to be freed. */
+					std::this_thread::sleep_for(
+						std::chrono::seconds(delay));
+
+					if (delay < 32) {
+						delay <<= 1;
+					}
+
+				} else {
+					delay = 1;
+				}
+			}
+
+			++index;
+
+		} while (ptr == NULL);
+
+		return(ptr);
+	}
+
+	static void mem_free(value_type* ptr)
+	{
+		PoolType::mem_free(ptr);
+	}
+
+private:
+	/** Add a new pool
+	@param n_pools Number of pools that existed when the add pool was
+			called.
+	@return true on success */
+	bool add_pool(size_t n_pools)
+	{
+		bool	added = false;
+
+		m_lock_strategy.enter();
+
+		if (n_pools < m_pools.size()) {
+			/* Some other thread already added a pool. */
+			added = true;
+		} else {
+			PoolType*	pool;
+
+			ut_ad(n_pools == m_pools.size());
+
+			pool = UT_NEW_NOKEY(PoolType(m_size));
+
+			if (pool != NULL) {
+				m_pools.push_back(pool);
+
+				ib::info() << "Number of transaction pools: "
+					<< m_pools.size();
+
+				added = true;
+			}
+		}
+
+		ut_ad(n_pools < m_pools.size() || !added);
+
+		m_lock_strategy.exit();
+
+		return(added);
+	}
+
+	/** Create the pool manager; get() divides by the number of pools. */
+	void create()
+	{
+		ut_a(m_size > sizeof(value_type));
+		m_lock_strategy.create();
+		const bool	added = add_pool(0);
+		ut_a(added);
+	}
+
+	/** Release the resources. */
+	void destroy()
+	{
+		typename Pools::iterator it;
+		typename Pools::iterator end = m_pools.end();
+
+		for (it = m_pools.begin(); it != end; ++it) {
+			PoolType*	pool = *it;
+
+			UT_DELETE(pool);
+		}
+
+		m_pools.clear();
+
+		m_lock_strategy.destroy();
+	}
+private:
+	// Disable copying
+	PoolManager(const PoolManager&);
+	PoolManager& operator=(const PoolManager&);
+
+	typedef std::vector<PoolType*, ut_allocator<PoolType*> >	Pools;
+
+	/** Size of each block */
+	size_t		m_size;
+
+	/** Pools managed by this manager */
+	Pools		m_pools;
+
+	/** Lock strategy to use */
+	LockStrategy		m_lock_strategy;
+};
+
+#endif /* ut0pool_h */
diff --git a/storage/innobase/include/ut0rbt.h b/storage/innobase/include/ut0rbt.h
new file mode 100644
index 00000000..38071165
--- /dev/null
+++ b/storage/innobase/include/ut0rbt.h
@@ -0,0 +1,254 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/******************************************************************//**
+@file include/ut0rbt.h
+Red-black tree
+
+Created 2007-03-20 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_UT0RBT_H
+#define INNOBASE_UT0RBT_H
+
+#if !defined(IB_RBT_TESTING)
+#include "ut0mem.h"
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#define	ut_malloc	malloc
+#define	ut_free		free
+#define	ulint		unsigned long
+#define	ut_a(c)		assert(c)
+#define ut_error	assert(0)
+#define	ibool		unsigned int
+#define	TRUE		1
+#define	FALSE		0
+#endif
+
+struct ib_rbt_node_t;
+typedef void (*ib_rbt_print_node)(const ib_rbt_node_t* node);
+typedef int (*ib_rbt_compare)(const void* p1, const void* p2);
+typedef int (*ib_rbt_arg_compare)(const void*, const void* p1, const void* p2);
+
+/** Red black tree color types */
+enum ib_rbt_color_t {
+	IB_RBT_RED,
+	IB_RBT_BLACK
+};
+
+/** Red black tree node */
+struct ib_rbt_node_t {
+	ib_rbt_color_t	color;			/* color of this node */
+
+	ib_rbt_node_t*	left;			/* points left child */
+	ib_rbt_node_t*	right;			/* points right child */
+	ib_rbt_node_t*	parent;			/* points parent node */
+
+	char		value[1];		/* Data value */
+};
+
+/** Red black tree instance.*/
+struct	ib_rbt_t {
+	ib_rbt_node_t*	nil;			/* Black colored node that is
+						used as a sentinel. This is
+						pre-allocated too.*/
+
+	ib_rbt_node_t*	root;			/* Root of the tree, this is
+						pre-allocated and the first
+						data node is the left child.*/
+
+	ulint		n_nodes;		/* Total number of data nodes */
+
+	ib_rbt_compare	compare;		/* Fn. to use for comparison */
+	ib_rbt_arg_compare
+			compare_with_arg;	/* Fn. to use for comparison
+						with argument */
+	ulint		sizeof_value;		/* Sizeof the item in bytes */
+	void*		cmp_arg;		/* Compare func argument */
+};
+
+/** The result of searching for a key in the tree, this is useful for
+a speedy lookup and insert if key doesn't exist.*/
+struct ib_rbt_bound_t {
+	const ib_rbt_node_t*
+			last;			/* Last node visited */
+
+	int		result;			/* Result of comparing with
+						the last non-nil node that
+						was visited */
+};
+
+/* Size in elements (t is an rb tree instance) */
+#define rbt_size(t)	(t->n_nodes)
+
+/* Check whether the rb tree is empty (t is an rb tree instance) */
+#define rbt_empty(t)	(rbt_size(t) == 0)
+
+/* Get data value (t is the data type, n is an rb tree node instance) */
+#define rbt_value(t, n) ((t*) &n->value[0])
+
+/* Compare a key with the node value (t is tree, k is key, n is node)*/
+#define rbt_compare(t, k, n) (t->compare(k, n->value))
+
+/**********************************************************************//**
+Free an instance of  a red black tree */
+void
+rbt_free(
+/*=====*/
+	ib_rbt_t*	tree);			/*!< in: rb tree to free */
+/**********************************************************************//**
+Create an instance of a red black tree
+@return rb tree instance */
+ib_rbt_t*
+rbt_create(
+/*=======*/
+	size_t		sizeof_value,		/*!< in: size in bytes */
+	ib_rbt_compare	compare);		/*!< in: comparator */
+/**********************************************************************//**
+Create an instance of a red black tree, whose comparison function takes
+an argument
+@return rb tree instance */
+ib_rbt_t*
+rbt_create_arg_cmp(
+/*===============*/
+	size_t		sizeof_value,		/*!< in: size in bytes */
+	ib_rbt_arg_compare
+			compare,		/*!< in: comparator */
+	void*	cmp_arg);		/*!< in: compare fn arg */
+/**********************************************************************//**
+Delete a node from the red black tree, identified by key */
+ibool
+rbt_delete(
+/*=======*/
+						/* out: TRUE on success */
+	ib_rbt_t*	tree,			/* in: rb tree */
+	const void*	key);			/* in: key to delete */
+/**********************************************************************//**
+Remove a node from the red black tree, NOTE: This function will not delete
+the node instance, THAT IS THE CALLERS RESPONSIBILITY.
+@return the deleted node with the const. */
+ib_rbt_node_t*
+rbt_remove_node(
+/*============*/
+	ib_rbt_t*	tree,			/*!< in: rb tree */
+	const ib_rbt_node_t*
+			node);			/*!< in: node to delete, this
+						is a fudge and declared const
+						because the caller has access
+						only to const nodes.*/
+/**********************************************************************//**
+Add data to the red black tree, identified by key (no dups yet!)
+@return inserted node */
+const ib_rbt_node_t*
+rbt_insert(
+/*=======*/
+	ib_rbt_t*	tree,			/*!< in: rb tree */
+	const void*	key,			/*!< in: key for ordering */
+	const void*	value);			/*!< in: data that will be
+						copied to the node.*/
+/**********************************************************************//**
+Add a new node to the tree, useful for data that is pre-sorted.
+@return appended node */
+const ib_rbt_node_t*
+rbt_add_node(
+/*=========*/
+	ib_rbt_t*	tree,			/*!< in: rb tree */
+	ib_rbt_bound_t*	parent,			/*!< in: parent */
+	const void*	value);			/*!< in: this value is copied
+						to the node */
+/**********************************************************************//**
+Return the left most data node in the tree
+@return left most node */
+const ib_rbt_node_t*
+rbt_first(
+/*======*/
+	const ib_rbt_t*	tree);			/*!< in: rb tree */
+/**********************************************************************//**
+Return the right most data node in the tree
+@return right most node */
+const ib_rbt_node_t*
+rbt_last(
+/*=====*/
+	const ib_rbt_t*	tree);			/*!< in: rb tree */
+/**********************************************************************//**
+Return the next node from current.
+@return successor node to current that is passed in. */
+const ib_rbt_node_t*
+rbt_next(
+/*=====*/
+	const ib_rbt_t*	tree,			/*!< in: rb tree */
+	const ib_rbt_node_t*			/* in: current node */
+			current);
+/**********************************************************************//**
+Return the prev node from current.
+@return predecessor node to current that is passed in */
+const ib_rbt_node_t*
+rbt_prev(
+/*=====*/
+	const ib_rbt_t*	tree,			/*!< in: rb tree */
+	const ib_rbt_node_t*			/* in: current node */
+			current);
+/**********************************************************************//**
+Search for the key, a node will be returned in parent.last, whether it
+was found or not. If not found then parent.last will contain the
+parent node for the possibly new key otherwise the matching node.
+@return result of last comparison */
+int
+rbt_search(
+/*=======*/
+	const ib_rbt_t*	tree,			/*!< in: rb tree */
+	ib_rbt_bound_t*	parent,			/*!< in: search bounds */
+	const void*	key);			/*!< in: key to search */
+/**********************************************************************//**
+Search for the key, a node will be returned in parent.last, whether it
+was found or not. If not found then parent.last will contain the
+parent node for the possibly new key otherwise the matching node.
+@return result of last comparison */
+int
+rbt_search_cmp(
+/*===========*/
+	const ib_rbt_t*	tree,			/*!< in: rb tree */
+	ib_rbt_bound_t*	parent,			/*!< in: search bounds */
+	const void*	key,			/*!< in: key to search */
+	ib_rbt_compare	compare,		/*!< in: comparator */
+	ib_rbt_arg_compare
+			arg_compare);		/*!< in: fn to compare items
+						with argument */
+/**********************************************************************//**
+Merge the nodes from src into dst. Return the number of nodes merged.
+@return no. of recs merged */
+ulint
+rbt_merge_uniq(
+/*===========*/
+	ib_rbt_t*	dst,			/*!< in: dst rb tree */
+	const ib_rbt_t*	src);			/*!< in: src rb tree */
+#if defined UNIV_DEBUG || defined IB_RBT_TESTING
+/**********************************************************************//**
+Verify the integrity of the RB tree. For debugging. 0 failure else height
+of tree (in count of black nodes).
+@return TRUE if OK FALSE if tree invalid. */
+ibool
+rbt_validate(
+/*=========*/
+	const ib_rbt_t*	tree);			/*!< in: tree to validate */
+#endif /* UNIV_DEBUG || IB_RBT_TESTING */
+
+#endif /* INNOBASE_UT0RBT_H */
diff --git a/storage/innobase/include/ut0rnd.h b/storage/innobase/include/ut0rnd.h
new file mode 100644
index 00000000..511eb21f
--- /dev/null
+++ b/storage/innobase/include/ut0rnd.h
@@ -0,0 +1,128 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0rnd.h
+Random numbers and hashing
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0rnd_h
+#define ut0rnd_h
+
+#include "ut0byte.h"
+#include <my_sys.h>
+
+#ifndef UNIV_INNOCHECKSUM
+/** Seed value of ut_rnd_gen() */
+extern std::atomic<uint32_t> ut_rnd_current;
+
+/** @return a pseudo-random 32-bit number */
+inline uint32_t ut_rnd_gen()
+{
+  /* This is a Galois linear-feedback shift register.
+  https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Galois_LFSRs
+  The generating primitive Galois Field polynomial is the Castagnoli
+  polynomial that was made popular by CRC-32C:
+  x^32+x^28+x^27+x^26+x^25+x^23+x^22+x^20+
+  x^19+x^18+x^14+x^13+x^11+x^10+x^9+x^8+x^6+1 */
+  const uint32_t crc32c= 0x1edc6f41;
+
+  uint32_t rnd= ut_rnd_current.load(std::memory_order_relaxed);
+
+  if (UNIV_UNLIKELY(rnd == 0))
+  {
+    rnd= static_cast<uint32_t>(my_interval_timer());
+    if (!rnd) rnd= 1;
+  }
+  else
+  {
+    bool lsb= rnd & 1;
+    rnd>>= 1;
+    if (lsb)
+      rnd^= crc32c;
+  }
+
+  ut_rnd_current.store(rnd, std::memory_order_relaxed);
+  return rnd;
+}
+
+/** @return a random number between 0 and n-1, inclusive */
+inline ulint ut_rnd_interval(ulint n)
+{
+  return n > 1 ? static_cast<ulint>(ut_rnd_gen() % n) : 0;
+}
+
+/*******************************************************//**
+The following function generates a hash value for a ulint integer
+to a hash table of size table_size, which should be a prime or some
+random number to work reliably.
+@return hash value */
+UNIV_INLINE
+ulint
+ut_hash_ulint(
+/*==========*/
+	ulint	 key,		/*!< in: value to be hashed */
+	ulint	 table_size);	/*!< in: hash table size */
+/*************************************************************//**
+Folds a 64-bit integer.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ull(
+/*========*/
+	ib_uint64_t	d)	/*!< in: 64-bit integer */
+	MY_ATTRIBUTE((const));
+/***********************************************************//**
+Looks for a prime number slightly greater than the given argument.
+The prime is chosen so that it is not near any power of 2.
+@return prime */
+ulint
+ut_find_prime(
+/*==========*/
+	ulint	n)	/*!< in: positive number > 100 */
+	MY_ATTRIBUTE((const));
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Folds a pair of ulints.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+	ulint	n1,	/*!< in: ulint */
+	ulint	n2)	/*!< in: ulint */
+	MY_ATTRIBUTE((const));
+/*************************************************************//**
+Folds a binary string.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+	const byte*	str,	/*!< in: string of bytes */
+	ulint		len)	/*!< in: length */
+	MY_ATTRIBUTE((pure));
+
+#include "ut0rnd.inl"
+
+#endif
diff --git a/storage/innobase/include/ut0rnd.inl b/storage/innobase/include/ut0rnd.inl
new file mode 100644
index 00000000..37da323f
--- /dev/null
+++ b/storage/innobase/include/ut0rnd.inl
@@ -0,0 +1,128 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0rnd.inl
+Random numbers and hashing
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+#define UT_HASH_RANDOM_MASK	1463735687
+#define UT_HASH_RANDOM_MASK2	1653893711
+
+#ifndef UNIV_INNOCHECKSUM
+
+/*******************************************************//**
+The following function generates a hash value for a ulint integer
+to a hash table of size table_size, which should be a prime
+or some random number for the hash table to work reliably.
+@return hash value */
+UNIV_INLINE
+ulint
+ut_hash_ulint(
+/*==========*/
+	ulint	 key,		/*!< in: value to be hashed */
+	ulint	 table_size)	/*!< in: hash table size */
+{
+	ut_ad(table_size);
+	key = key ^ UT_HASH_RANDOM_MASK2;
+
+	return(key % table_size);
+}
+
+/*************************************************************//**
+Folds a 64-bit integer.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ull(
+/*========*/
+	ib_uint64_t	d)	/*!< in: 64-bit integer */
+{
+	return(ut_fold_ulint_pair((ulint) d & ULINT32_MASK,
+				  (ulint) (d >> 32)));
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Folds a pair of ulints.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+	ulint	n1,	/*!< in: ulint */
+	ulint	n2)	/*!< in: ulint */
+{
+	return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1)
+		^ UT_HASH_RANDOM_MASK) + n2);
+}
+
+/*************************************************************//**
+Folds a binary string.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+	const byte*	str,	/*!< in: string of bytes */
+	ulint		len)	/*!< in: length */
+{
+	ulint		fold = 0;
+	/* ulint-wide mask; 0xFFFFFFF8 would clear bits 32+ of a 64-bit len */
+	const byte*	str_end	= str + (len & ~(ulint) 7);
+	ut_ad(str || !len);
+
+	while (str < str_end) {
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	}
+
+	switch (len & 0x7) {
+	case 7:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		/* fall through */
+	case 6:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		/* fall through */
+	case 5:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		/* fall through */
+	case 4:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		/* fall through */
+	case 3:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		/* fall through */
+	case 2:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		/* fall through */
+	case 1:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	}
+
+	return(fold);
+}
diff --git a/storage/innobase/include/ut0sort.h b/storage/innobase/include/ut0sort.h
new file mode 100644
index 00000000..4f1d4c04
--- /dev/null
+++ b/storage/innobase/include/ut0sort.h
@@ -0,0 +1,104 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0sort.h
+Sort utility
+
+Created 11/9/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0sort_h
+#define ut0sort_h
+
+/* This module gives a macro definition of the body of
+a standard sort function for an array of elements of any
+type. The comparison function is given as a parameter to
+the macro. The sort algorithm is mergesort, whose worst-case
+running time is O(n log n).
+*/
+
+/*******************************************************************//**
+This macro expands to the body of a recursive merge sort function.
+The sort function must return void (the body uses a bare "return;") and
+must be defined separately for each element type, as must the comparison
+function. SORT_FUN is the name of the sort function itself; the body
+calls it recursively as SORT_FUN(ARR, AUX_ARR, low, high).
+ARR is the array to be sorted and AUX_ARR an auxiliary array of the
+same size; LOW (inclusive) and HIGH (noninclusive) limit the interval
+to sort, and LOW < HIGH is asserted with ut_ad().
+CMP_FUN is the comparison function name. It takes as arguments
+two elements from the array and returns 1, if the first is bigger,
+0 if equal, and -1 if the second bigger. Elements that compare equal
+keep their relative order. */
+
+#define UT_SORT_FUNCTION_BODY(SORT_FUN, ARR, AUX_ARR, LOW, HIGH, CMP_FUN)\
+{\
+	ulint		ut_sort_mid77;\
+	ulint		ut_sort_i77;\
+	ulint		ut_sort_low77;\
+	ulint		ut_sort_high77;\
+\
+	ut_ad((LOW) < (HIGH));\
+	ut_ad(ARR);\
+	ut_ad(AUX_ARR);\
+\
+	if ((LOW) == (HIGH) - 1) {\
+		return;\
+	} else if ((LOW) == (HIGH) - 2) {\
+		if (CMP_FUN((ARR)[LOW], (ARR)[(HIGH) - 1]) > 0) {\
+			(AUX_ARR)[LOW] = (ARR)[LOW];\
+			(ARR)[LOW] = (ARR)[(HIGH) - 1];\
+			(ARR)[(HIGH) - 1] = (AUX_ARR)[LOW];\
+		}\
+		return;\
+	}\
+\
+	ut_sort_mid77 = ((LOW) + (HIGH)) / 2;\
+\
+	SORT_FUN((ARR), (AUX_ARR), (LOW), ut_sort_mid77);\
+	SORT_FUN((ARR), (AUX_ARR), ut_sort_mid77, (HIGH));\
+\
+	ut_sort_low77 = (LOW);\
+	ut_sort_high77 = ut_sort_mid77;\
+\
+	for (ut_sort_i77 = (LOW); ut_sort_i77 < (HIGH); ut_sort_i77++) {\
+\
+		if (ut_sort_low77 >= ut_sort_mid77) {\
+			(AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\
+			ut_sort_high77++;\
+		} else if (ut_sort_high77 >= (HIGH)) {\
+			(AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\
+			ut_sort_low77++;\
+		} else if (CMP_FUN((ARR)[ut_sort_low77],\
+				   (ARR)[ut_sort_high77]) > 0) {\
+			(AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\
+			ut_sort_high77++;\
+		} else {\
+			(AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\
+			ut_sort_low77++;\
+		}\
+	}\
+\
+	memcpy((void*) ((ARR) + (LOW)), (AUX_ARR) + (LOW),\
+	       ((HIGH) - (LOW)) * sizeof *(ARR));\
+}\
+
+
+#endif
+
diff --git a/storage/innobase/include/ut0stage.h b/storage/innobase/include/ut0stage.h
new file mode 100644
index 00000000..17fbd91b
--- /dev/null
+++ b/storage/innobase/include/ut0stage.h
@@ -0,0 +1,499 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ut0stage.h
+Supplementary code to performance schema stage instrumentation.
+
+Created Nov 12, 2014 Vasil Dimov
+*******************************************************/
+
+#ifndef ut0stage_h
+#define ut0stage_h
+
+#include <algorithm>
+#include <math.h>
+
+#include "my_global.h" /* needed for headers from mysql/psi/ */
+
+#include "mysql/psi/mysql_stage.h" /* mysql_stage_inc_work_completed */
+#include "mysql/psi/psi.h" /* HAVE_PSI_STAGE_INTERFACE, PSI_stage_progress */
+
+#include "dict0mem.h" /* dict_index_t */
+#include "row0log.h" /* row_log_estimate_work() */
+#include "srv0srv.h" /* ut_stage_alter_t */
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+
+/** Class used to report ALTER TABLE progress via performance_schema.
+The only user of this class is the ALTER TABLE code and it calls the methods
+in the following order
+constructor
+begin_phase_read_pk()
+  multiple times:
+    n_pk_recs_inc() // once per record read
+    inc() // once per page read
+end_phase_read_pk()
+if any new indexes are being added, for each one:
+  begin_phase_sort()
+    multiple times:
+      inc() // once per record sorted
+  begin_phase_insert()
+    multiple times:
+      inc() // once per record inserted
+  begin_phase_log_index()
+    multiple times:
+      inc() // once per log-block applied
+begin_phase_log_table()
+    multiple times:
+      inc() // once per log-block applied
+begin_phase_end()
+destructor
+
+This class knows the specifics of each phase and tries to increment the
+progress in an even manner across the entire ALTER TABLE lifetime. */
+class ut_stage_alter_t {
+public:
+	/** Constructor. Every data member gets a defined value here.
+	@param[in]	pk	primary key of the old table */
+	explicit ut_stage_alter_t(const dict_index_t* pk)
+		:
+		m_progress(NULL),
+		m_pk(pk),
+		m_n_pk_recs(0),
+		m_n_pk_pages(0),
+		m_n_recs_per_page(1.0),
+		m_n_sort_indexes(0),
+		m_sort_multi_factor(1),
+		m_n_recs_processed(0),
+		m_cur_phase(NOT_STARTED)
+	{}
+
+	/** Destructor. */
+	~ut_stage_alter_t();
+
+	/** Flag an ALTER TABLE start (read primary key phase).
+	@param[in]	n_sort_indexes	number of indexes that will be sorted
+	during ALTER TABLE, used for estimating the total work to be done */
+	void
+	begin_phase_read_pk(
+		ulint	n_sort_indexes);
+
+	/** Increment the number of records in PK (table) with 1.
+	This is used to get more accurate estimate about the number of
+	records per page which is needed because some phases work on
+	per-page basis while some work on per-record basis and we want
+	to get the progress as even as possible. */
+	void
+	n_pk_recs_inc();
+
+	/** Flag either one record or one page processed, depending on the
+	current phase.
+	@param[in]	inc_val	flag this many units processed at once */
+	void
+	inc(
+		ulint	inc_val = 1);
+
+	/** Flag the end of reading of the primary key.
+	Here we know the exact number of pages and records and calculate
+	the number of records per page and refresh the estimate. */
+	void
+	end_phase_read_pk();
+
+	/** Flag the beginning of the sort phase.
+	@param[in]	sort_multi_factor	since merge sort processes
+	one page more than once we only update the estimate once per this
+	many pages processed. */
+	void
+	begin_phase_sort(
+		double	sort_multi_factor);
+
+	/** Flag the beginning of the insert phase. */
+	void
+	begin_phase_insert();
+
+	/** Flag the beginning of the log index phase. */
+	void
+	begin_phase_log_index();
+
+	/** Flag the beginning of the log table phase. */
+	void
+	begin_phase_log_table();
+
+	/** Flag the beginning of the end phase. */
+	void
+	begin_phase_end();
+
+private:
+
+	/** Update the estimate of total work to be done. */
+	void
+	reestimate();
+
+	/** Change the current phase.
+	@param[in]	new_stage	pointer to the new stage to change to */
+	void
+	change_phase(
+		const PSI_stage_info*	new_stage);
+
+	/** Performance schema accounting object. */
+	PSI_stage_progress*	m_progress;
+
+	/** Old table PK. Used for calculating the estimate. */
+	const dict_index_t*	m_pk;
+
+	/** Number of records in the primary key (table), including delete
+	marked records. */
+	ulint			m_n_pk_recs;
+
+	/** Number of leaf pages in the primary key. */
+	ulint			m_n_pk_pages;
+
+	/** Estimated records per page in the PK; 1.0 until it is known. */
+	double			m_n_recs_per_page;
+
+	/** Number of indexes that are being added. */
+	ulint			m_n_sort_indexes;
+
+	/** During the sort phase, increment the counter once per this
+	many pages processed. This is because sort processes one page more
+	than once. */
+	ulint			m_sort_multi_factor;
+
+	/** Number of records processed during sort & insert phases. We
+	need to increment the counter only once per page, or once per
+	recs-per-page records. */
+	ulint			m_n_recs_processed;
+
+	/** Current phase. */
+	enum {
+		NOT_STARTED = 0,
+		READ_PK = 1,
+		SORT = 2,
+		INSERT = 3,
+		/* JAN: TODO: MySQL 5.7 vrs. MariaDB sql/log.h
+		LOG_INDEX = 5,
+		LOG_TABLE = 6, */
+		LOG_INNODB_INDEX = 5,
+		LOG_INNODB_TABLE = 6,
+		END = 7,
+	}			m_cur_phase;
+};
+
+/** Destructor. Reports the work as fully completed and ends the stage,
+unless there is no progress object. */
+inline
+ut_stage_alter_t::~ut_stage_alter_t()
+{
+	if (m_progress != NULL) {
+		/* Make completed equal to estimated before quitting. */
+		const ulonglong	estimated
+			= mysql_stage_get_work_estimated(m_progress);
+
+		mysql_stage_set_work_completed(m_progress, estimated);
+
+		mysql_end_stage();
+	}
+}
+
+/** Enter the read primary key phase, which starts the ALTER TABLE report.
+@param[in]	n_sort_indexes	number of indexes that will be sorted
+during ALTER TABLE, used for estimating the total work to be done */
+inline
+void
+ut_stage_alter_t::begin_phase_read_pk(
+	ulint	n_sort_indexes)
+{
+	/* Both fields are read by reestimate() below. */
+	m_cur_phase = READ_PK;
+	m_n_sort_indexes = n_sort_indexes;
+
+	m_progress = mysql_set_stage(
+		srv_stage_alter_table_read_pk_internal_sort.m_key);
+	/* Nothing is done yet; derive the first estimate from that. */
+	mysql_stage_set_work_completed(m_progress, 0);
+	reestimate();
+}
+
+/** Count one more record in the PK (table).
+The count gives a more accurate estimate of the number of records
+per page, which is needed because some phases work on a per-page
+basis while others work on a per-record basis, and the progress
+should be as even as possible. */
+inline
+void
+ut_stage_alter_t::n_pk_recs_inc()
+{
+	m_n_pk_recs += 1;
+}
+
+/** Flag either one record or one page processed, depending on the
+current phase.
+@param[in]	inc_val	flag this many units processed at once */
+inline
+void
+ut_stage_alter_t::inc(ulint inc_val)
+{
+	if (m_progress == NULL) {
+		return;
+	}
+
+	/* During the sort phase the progress advances only once per this
+	many pages; in the insert phase the factor stays 1. */
+	ulint	factor = 1;
+
+	switch (m_cur_phase) {
+	case NOT_STARTED:
+		/* Not expected: m_progress is assigned only after a phase
+		has been set. */
+		ut_error;
+	case READ_PK:
+		/* Called once per page read. Overall this phase reads all
+		the pages of the PK and does work proportional to the number
+		of added indexes, so a page counts as 1 + m_n_sort_indexes. */
+		ut_ad(inc_val == 1);
+		m_n_pk_pages++;
+		inc_val = 1 + m_n_sort_indexes;
+		break;
+	case SORT:
+		factor = m_sort_multi_factor;
+		/* fall through */
+	case INSERT: {
+		/* Called once per record processed, but the progress is
+		advanced only on every nth call. Fractional numbers are
+		needed because "records per page" is naturally such a
+		number; to avoid rounding skew, with (double) N records
+		per page the work_completed should be incremented on the
+		inc() calls round(k*N), for k=1,2,3... */
+		const double	step = static_cast<double>(factor)
+			* m_n_recs_per_page;
+
+		/* Number of calls made in these phases before this one. */
+		const ulint	seen = m_n_recs_processed++;
+
+		const double	ratio = static_cast<double>(seen) / step;
+		const ulint	k = static_cast<ulint>(round(ratio));
+		const ulint	nth = static_cast<ulint>(
+			round(static_cast<double>(k) * step));
+
+		if (seen != nth) {
+			/* Not a reporting point; the counter has
+			already been advanced. */
+			return;
+		}
+
+		break;
+	}
+	/* JAN: TODO: MySQL 5.7 has LOG_INDEX and LOG_TABLE cases here
+	instead. */
+	case LOG_INNODB_INDEX:
+	case LOG_INNODB_TABLE:
+	case END:
+		/* Every call counts as inc_val units. */
+		break;
+	}
+
+	/* Report the units and refresh the estimate of the total. */
+	mysql_stage_inc_work_completed(m_progress, inc_val);
+	reestimate();
+}
+
+/** Flag the end of reading of the primary key.
+At this point the exact numbers of pages and records are known, so the
+estimate is refreshed and the number of records per page is calculated. */
+inline
+void
+ut_stage_alter_t::end_phase_read_pk()
+{
+	reestimate();
+
+	/* Never below 1; this also covers an empty tree, for which the
+	number of pages is 0 and the division must be avoided. */
+	double	recs_per_page = 1.0;
+	if (m_n_pk_pages != 0) {
+		const double	ratio = static_cast<double>(m_n_pk_recs)
+			/ static_cast<double>(m_n_pk_pages);
+		if (ratio > 1.0) {
+			recs_per_page = ratio;
+		}
+	}
+	m_n_recs_per_page = recs_per_page;
+}
+
+/** Flag the beginning of the sort phase.
+@param[in]	sort_multi_factor	since merge sort processes
+one page more than once we only update the estimate once per this
+many pages processed; it is rounded to an integer that is at least 1. */
+inline
+void
+ut_stage_alter_t::begin_phase_sort(
+	double	sort_multi_factor)
+{
+	/* Factors up to 1.0 are clamped to 1 instead of being rounded,
+	so that the stored factor is never 0. */
+	m_sort_multi_factor = sort_multi_factor <= 1.0
+		? 1
+		: static_cast<ulint>(round(sort_multi_factor));
+
+	/* Switch the reported stage to merge sort. */
+	change_phase(&srv_stage_alter_table_merge_sort);
+}
+
+/** Flag the beginning of the insert phase by changing the phase to
+the stage srv_stage_alter_table_insert. */
+inline void
+ut_stage_alter_t::begin_phase_insert()
+{
+	change_phase(&srv_stage_alter_table_insert);
+}
+
+/** Flag the beginning of the log index phase by changing the phase to
+the stage srv_stage_alter_table_log_index. */
+inline void
+ut_stage_alter_t::begin_phase_log_index()
+{
+	change_phase(&srv_stage_alter_table_log_index);
+}
+
+/** Flag the beginning of the log table phase by changing the phase to
+the stage srv_stage_alter_table_log_table. */
+inline void
+ut_stage_alter_t::begin_phase_log_table()
+{
+	change_phase(&srv_stage_alter_table_log_table);
+}
+
+/** Flag the beginning of the end phase by changing the phase to
+the stage srv_stage_alter_table_end. */
+inline void
+ut_stage_alter_t::begin_phase_end()
+{
+	change_phase(&srv_stage_alter_table_end);
+}
+
+/** Update the estimate of total work to be done. */
+inline
+void
+ut_stage_alter_t::reestimate()
+{
+	if (m_progress == NULL) {
+		return;
+	}
+
+	if (m_cur_phase == LOG_INNODB_TABLE) {
+		/* During the log table phase the estimate is the work
+		done so far plus the log size that remains. */
+		mysql_stage_set_work_estimated(
+			m_progress,
+			mysql_stage_get_work_completed(m_progress)
+			+ row_log_estimate_work(m_pk));
+		return;
+	}
+
+	/* The other phases use a formula, regardless of how much work has
+	been done. While the PK is being read its page count is only the
+	approximate stat_n_leaf_pages; afterwards the exact count is used. */
+	ulint	n_pk_pages = m_n_pk_pages;
+	if (m_cur_phase == READ_PK) {
+		n_pk_pages = m_pk->stat_n_leaf_pages;
+	}
+
+	/* Work units per PK page: 1 for reading the PK, 1 per created
+	index for row_merge_buf_sort() inside the read PK, and 2 per
+	created index for sort & insert. */
+	const ulint	units_per_page
+		= 1 + m_n_sort_indexes + m_n_sort_indexes * 2;
+	ulonglong	estimate __attribute__((unused))
+		= n_pk_pages * units_per_page + row_log_estimate_work(m_pk);
+
+	/* Prevent estimate < completed */
+	const ulonglong	completed
+		= mysql_stage_get_work_completed(m_progress);
+	if (estimate < completed) {
+		estimate = completed;
+	}
+
+	mysql_stage_set_work_estimated(m_progress, estimate);
+}
+
+/** Change the current phase, carrying the progress counters over.
+@param[in]	new_stage	pointer to the new stage to change to */
+inline
+void
+ut_stage_alter_t::change_phase(
+	const PSI_stage_info*	new_stage)
+{
+	if (m_progress == NULL) {
+		return;
+	}
+
+	/* Remember the counters of the stage that is being left. */
+	const ulonglong	completed = mysql_stage_get_work_completed(m_progress);
+	const ulonglong	estimated = mysql_stage_get_work_estimated(m_progress);
+
+	if (new_stage == &srv_stage_alter_table_end) {
+		m_cur_phase = END;
+	} else if (new_stage == &srv_stage_alter_table_log_table) {
+		m_cur_phase = LOG_INNODB_TABLE;
+	} else if (new_stage == &srv_stage_alter_table_log_index) {
+		/* JAN: TODO: MySQL 5.7 used LOG_INDEX and LOG_TABLE */
+		m_cur_phase = LOG_INNODB_INDEX;
+	} else if (new_stage == &srv_stage_alter_table_insert) {
+		m_cur_phase = INSERT;
+	} else if (new_stage == &srv_stage_alter_table_merge_sort) {
+		m_cur_phase = SORT;
+	} else if (new_stage == &srv_stage_alter_table_read_pk_internal_sort) {
+		m_cur_phase = READ_PK;
+	} else {
+		ut_error;
+	}
+
+	m_progress = mysql_set_stage(new_stage->m_key);
+	mysql_stage_set_work_completed(m_progress, completed);
+	mysql_stage_set_work_estimated(m_progress, estimated);
+}
+#else /* HAVE_PSI_STAGE_INTERFACE */
+
+/** No-op variant of ut_stage_alter_t, compiled when
+HAVE_PSI_STAGE_INTERFACE is not defined: every method body is empty. */
+class ut_stage_alter_t {
+public:
+	explicit ut_stage_alter_t(const dict_index_t*) {}
+
+	/* Reading of the primary key */
+	void begin_phase_read_pk(ulint)	{}
+	void n_pk_recs_inc() {}
+	void end_phase_read_pk() {}
+
+	/* Progress increments, without or with an explicit count */
+	void inc() {}
+	void inc(ulint) {}
+
+	/* Phases that run for each added index */
+	void begin_phase_sort(double) {}
+	void begin_phase_insert() {}
+	void begin_phase_log_index() {}
+	/* Phases that close the ALTER TABLE */
+	void begin_phase_log_table() {}
+	void begin_phase_end() {}
+};
+
+#endif /* HAVE_PSI_STAGE_INTERFACE */
+
+#endif /* ut0stage_h */
diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h
new file mode 100644
index 00000000..fe16ce14
--- /dev/null
+++ b/storage/innobase/include/ut0ut.h
@@ -0,0 +1,444 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0ut.h
+Various utilities
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0ut_h
+#define ut0ut_h
+
+/* Do not include univ.i because univ.i includes this. */
+
+#include <ostream>
+#include <sstream>
+#include <string.h>
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "db0err.h"
+
+#include <time.h>
+
+#ifndef MYSQL_SERVER
+#include <ctype.h>
+#endif /* MYSQL_SERVER */
+
+#include <stdarg.h>
+
+#include <string>
+
+/** Index name prefix in fast index creation, as a string constant */
+#define TEMP_INDEX_PREFIX_STR	"\377"
+
+#define ut_max	std::max
+#define ut_min	std::min
+
+/** Calculate the minimum of two pairs.
+@param[out]	min_hi	MSB of the minimum pair
+@param[out]	min_lo	LSB of the minimum pair
+@param[in]	a_hi	MSB of the first pair
+@param[in]	a_lo	LSB of the first pair
+@param[in]	b_hi	MSB of the second pair
+@param[in]	b_lo	LSB of the second pair */
+UNIV_INLINE
+void
+ut_pair_min(
+	ulint*	min_hi,
+	ulint*	min_lo,
+	ulint	a_hi,
+	ulint	a_lo,
+	ulint	b_hi,
+	ulint	b_lo);
+/******************************************************//**
+Compares two ulints.
+@return 1 if a > b, 0 if a == b, -1 if a < b */
+UNIV_INLINE
+int
+ut_ulint_cmp(
+/*=========*/
+	ulint	a,	/*!< in: ulint */
+	ulint	b);	/*!< in: ulint */
+/** Compare two pairs of integers.
+@param[in]	a_h	more significant part of first pair
+@param[in]	a_l	less significant part of first pair
+@param[in]	b_h	more significant part of second pair
+@param[in]	b_l	less significant part of second pair
+@return comparison result of (a_h,a_l) and (b_h,b_l)
+@retval -1 if (a_h,a_l) is less than (b_h,b_l)
+@retval 0 if (a_h,a_l) is equal to (b_h,b_l)
+@retval 1 if (a_h,a_l) is greater than (b_h,b_l) */
+UNIV_INLINE
+int
+ut_pair_cmp(
+	ulint	a_h,
+	ulint	a_l,
+	ulint	b_h,
+	ulint	b_l)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Calculates fast the remainder of n/m when m is a power of two.
+@param n in: numerator
+@param m in: denominator, must be a power of two
+@return the remainder of n/m */
+template <typename T> inline T ut_2pow_remainder(T n, T m)
+{ return (m - 1) & n; }
+/** Calculates the biggest multiple of m that is not bigger than n
+when m is a power of two.  In other words, rounds n down to m * k.
+@param n in: number to round down
+@param m in: alignment, must be a power of two
+@return n rounded down to the biggest possible integer multiple of m */
+template <typename T> inline T ut_2pow_round(T n, T m)
+{ return ~(m - 1) & n; }
+/********************************************************//**
+Calculates the smallest multiple of m that is not smaller than n
+when m is a power of two.  In other words, rounds n up to m * k.
+@param n in: number to round up; evaluated once
+@param m in: alignment, must be a power of two; evaluated twice
+@return n rounded up to the smallest possible integer multiple of m */
+#define UT_CALC_ALIGN(n, m) (((n) + (m) - 1) & ~((m) - 1))
+template <typename T> inline T ut_calc_align(T n, T m)
+{ return static_cast<T>(~(m - 1) & (n + m - 1)); }
+
+/*************************************************************//**
+Calculates fast the 2-logarithm of a number, rounded upward to an
+integer.
+@return logarithm in the base 2, rounded upward */
+UNIV_INLINE
+ulint
+ut_2_log(
+/*=====*/
+	ulint	n);	/*!< in: number */
+/*************************************************************//**
+Calculates 2 to power n.
+@return 2 to power n */
+UNIV_INLINE
+ulint
+ut_2_exp(
+/*=====*/
+	ulint	n);	/*!< in: number */
+
+/**********************************************************//**
+Returns the number of milliseconds since some epoch.  The
+value may wrap around.  It should only be used for heuristic
+purposes.
+@return ms since epoch */
+ulint
+ut_time_ms(void);
+/*============*/
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Determine how many bytes (groups of 8 bits) are needed to
+store the given number of bits: (b + 7) shifted right by 3.
+@param b in: bits; the argument is evaluated once
+@return number of bytes (octets) needed to represent b */
+#define UT_BITS_IN_BYTES(b) (((b) + 7) >> 3)
+
+/** Determines if a number is zero or a power of two.
+@param[in]	n	number; the argument is evaluated twice
+@return nonzero if n is zero or a power of two; zero otherwise */
+#define ut_is_2pow(n) (!((n) & ((n) - 1)))
+
+/** Orders C strings by strcmp(), for use as the comparator of
+containers keyed by char*, e.g. std::map.
+Both arguments are passed to strcmp() as they are. */
+struct ut_strcmp_functor
+{
+	/** @return whether a sorts before b */
+	bool operator()(const char* a, const char* b) const
+	{
+		return(0 > strcmp(a, b));
+	}
+};
+
+/**********************************************************//**
+Prints a timestamp to a file. */
+void
+ut_print_timestamp(
+/*===============*/
+	FILE*	file)	/*!< in: file where to print */
+	ATTRIBUTE_COLD __attribute__((nonnull));
+
+#ifndef UNIV_INNOCHECKSUM
+
+/**********************************************************//**
+Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */
+void
+ut_sprintf_timestamp(
+/*=================*/
+	char*	buf); /*!< out: buffer where to sprintf */
+
+/*************************************************************//**
+Prints the contents of a memory buffer in hex and ascii. */
+void
+ut_print_buf(
+/*=========*/
+	FILE*		file,	/*!< in: file where to print */
+	const void*	buf,	/*!< in: memory buffer */
+	ulint		len);	/*!< in: length of the buffer */
+
+/*************************************************************//**
+Prints the contents of a memory buffer in hex. */
+void
+ut_print_buf_hex(
+/*=============*/
+	std::ostream&	o,	/*!< in/out: output stream */
+	const void*	buf,	/*!< in: memory buffer */
+	ulint		len)	/*!< in: length of the buffer */
+	MY_ATTRIBUTE((nonnull));
+/*************************************************************//**
+Prints the contents of a memory buffer in hex and ascii. */
+void
+ut_print_buf(
+/*=========*/
+	std::ostream&	o,	/*!< in/out: output stream */
+	const void*	buf,	/*!< in: memory buffer */
+	ulint		len)	/*!< in: length of the buffer */
+	MY_ATTRIBUTE((nonnull));
+
+/* Forward declaration of transaction handle */
+struct trx_t;
+
+/** Get a fixed-length string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier.
+ @param		[in]	trx		transaction (NULL=no quotes).
+ @param		[in]	name		table name.
+ @retval	String quoted as an SQL identifier.
+*/
+std::string
+ut_get_name(
+	const trx_t*	trx,
+	const char*	name);
+
+/**********************************************************************//**
+Outputs a fixed-length string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier. */
+void
+ut_print_name(
+/*==========*/
+	FILE*		ef,	/*!< in: stream */
+	const trx_t*	trx,	/*!< in: transaction */
+	const char*	name);	/*!< in: table name to print */
+/** Format a table name, quoted as an SQL identifier.
+If the name contains a slash '/', the result will contain two
+identifiers separated by a period (.), as in SQL
+database_name.table_name.
+@see table_name_t
+@param[in]	name		table or index name
+@param[out]	formatted	formatted result, will be NUL-terminated
+@param[in]	formatted_size	size of the buffer in bytes
+@return pointer to 'formatted' */
+char*
+ut_format_name(
+	const char*	name,
+	char*		formatted,
+	ulint		formatted_size);
+
+/**********************************************************************//**
+Catenate files. */
+void
+ut_copy_file(
+/*=========*/
+	FILE*	dest,	/*!< in: output file */
+	FILE*	src);	/*!< in: input file to be appended to output */
+
+/*************************************************************//**
+Convert an error number to a human readable text message. The
+returned string is static and should not be freed or modified.
+@return string, describing the error */
+const char*
+ut_strerr(
+/*======*/
+	dberr_t	num);	/*!< in: error number */
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+namespace ib {
+
+/** This is a wrapper class, used to print any unsigned integer type
+in hexadecimal format.  The main purpose of this data type is to
+overload the global operator<<, so that we can print the given
+wrapper value in hex. */
+struct hex {
+	explicit hex(uintmax_t t): m_val(t) {}
+	const uintmax_t	m_val;
+};
+
+/** Overload of operator<< for ib::hex: prints the wrapped unsigned value
+into the output stream in hexadecimal format with a base prefix, and then
+puts the format flags of the stream back to what they were before.
+@param[in,out]	lhs	the output stream into which rhs is written.
+@param[in]	rhs	the object to be written into lhs.
+@retval	reference to the output stream. */
+inline
+std::ostream&
+operator<<(std::ostream& lhs, const hex& rhs)
+{
+	const std::ios_base::fmtflags	ff = lhs.flags();
+	lhs << std::showbase << std::hex << rhs.m_val;
+	/* flags() replaces every flag, whereas setf() could only add to
+	them and would leave showbase and hex set */
+	lhs.flags(ff);
+	return(lhs);
+}
+
+/** Wrapper that marks a number for printing in IEC style */
+struct bytes_iec {
+  const unsigned long long m_val;
+  explicit bytes_iec(unsigned long long t): m_val(t) {}
+  double get_double() const { return static_cast<double>(this->m_val); }
+};
+
+/** Like hex operator above, except for bytes_iec */
+std::ostream &operator<<(std::ostream &lhs, const bytes_iec &rhs);
+
+/** The class logger is the base class of all the error log related classes.
+It contains a std::ostringstream object.  The main purpose of this class is
+to forward operator<< to the underlying std::ostringstream object.  Do not
+use this class directly, instead use one of the derived classes. */
+class logger
+{
+protected:
+  /* Protected constructor: only derived classes can create a logger */
+  ATTRIBUTE_COLD ATTRIBUTE_NOINLINE logger() = default;
+public:
+  template<typename T> ATTRIBUTE_COLD ATTRIBUTE_NOINLINE
+  logger& operator<<(const T& rhs)
+  {
+    m_oss << rhs;
+    return *this;
+  }
+
+  /** Handle a fixed character string in the same way as a pointer to
+  an unknown-length character string, to reduce object code bloat. */
+  template<size_t N> logger& operator<<(const char (&rhs)[N])
+  { return *this << static_cast<const char*>(rhs); }
+
+  /** Output an error code name */
+  ATTRIBUTE_COLD logger& operator<<(dberr_t err);
+
+  /** Append a string.
+  @param buf   string buffer
+  @param size  number of characters to append from buf
+  @return the output stream */
+  ATTRIBUTE_COLD __attribute__((noinline))
+  std::ostream &write(const char *buf, std::streamsize size)
+  {
+    return m_oss.write(buf, size);
+  }
+  /** Append bytes; forwards to the const char* overload of write() */
+  std::ostream &write(const byte *buf, std::streamsize size)
+  { return write(reinterpret_cast<const char*>(buf), size); }
+  /** The stream that operator<< and write() append to */
+  std::ostringstream m_oss;
+};
+
+/** The class info is used to emit informational log messages.  It is to be
+used similar to std::cout.  But the log messages will be emitted only when
+the dtor is called.  The preferred usage of this class is to make use of
+unnamed temporaries as follows:
+
+ib::info() << "The server started successfully.";
+
+In the above usage, the temporary object will be destroyed at the end of the
+statement and hence the log message will be emitted at the end of the
+statement.  If a named object is created, then the log message will be emitted
+only when it goes out of scope or is destroyed. */
+class info : public logger {
+public:
+	ATTRIBUTE_COLD
+	~info();
+};
+
+/** The class warn is used to emit warning messages.  Refer to the
+documentation of class info for usage details. */
+class warn : public logger {
+public:
+	ATTRIBUTE_COLD
+	~warn();
+};
+
+/** The class error is used to emit error messages.  Refer to the
+documentation of class info for usage details. */
+class error : public logger {
+public:
+	ATTRIBUTE_COLD
+	~error();
+	/** Indicates that error::~error() was invoked. Can be used to
+	determine if error messages were logged during InnoDB code execution.
+	@return true if there were error messages, false otherwise. */
+	static bool was_logged() { return logged; }
+
+private:
+	/** true if error::~error() was invoked, false otherwise */
+	static bool logged;
+};
+
+/** The class fatal is used to emit an error message and stop the server
+by crashing it; its destructor is marked ATTRIBUTE_NORETURN.  Use this class
+when the server must be stopped immediately.  See class info for usage. */
+class fatal : public logger {
+public:
+	ATTRIBUTE_NORETURN
+	~fatal();
+};
+
+/** Emit an error message if the given predicate is true, otherwise emit a
+warning message.  The predicate is fixed at construction, in m_error. */
+class error_or_warn : public logger {
+public:
+	ATTRIBUTE_COLD
+	error_or_warn(bool	pred)
+	: m_error(pred)
+	{}
+
+	ATTRIBUTE_COLD
+	~error_or_warn();
+private:
+	const bool	m_error;
+};
+
+/** Emit a fatal message if the given predicate is true, otherwise emit an
+error message.  The predicate is fixed at construction, in m_fatal. */
+class fatal_or_error : public logger {
+public:
+	ATTRIBUTE_COLD
+	fatal_or_error(bool	pred)
+	: m_fatal(pred)
+	{}
+
+	ATTRIBUTE_COLD
+	~fatal_or_error();
+private:
+	const bool	m_fatal;
+};
+
+} // namespace ib
+
+#include "ut0ut.inl"
+
+#endif
+
diff --git a/storage/innobase/include/ut0ut.inl b/storage/innobase/include/ut0ut.inl
new file mode 100644
index 00000000..73feaf82
--- /dev/null
+++ b/storage/innobase/include/ut0ut.inl
@@ -0,0 +1,143 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0ut.inl
+Various utilities
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+#include <algorithm>
+
+/** Select the smaller of two (hi, lo) pairs, ordered first by hi, then by lo.
+@param[out]	min_hi	more significant part of the smaller pair
+@param[out]	min_lo	less significant part of the smaller pair
+@param[in]	a_hi	more significant part of the first pair
+@param[in]	a_lo	less significant part of the first pair
+@param[in]	b_hi	more significant part of the second pair
+@param[in]	b_lo	less significant part of the second pair */
+UNIV_INLINE
+void
+ut_pair_min(
+	ulint*	min_hi,
+	ulint*	min_lo,
+	ulint	a_hi,
+	ulint	a_lo,
+	ulint	b_hi,
+	ulint	b_lo)
+{
+	if (a_hi < b_hi) {
+		*min_hi = a_hi;
+		*min_lo = a_lo;
+	} else if (b_hi < a_hi) {
+		*min_hi = b_hi;
+		*min_lo = b_lo;
+	} else {
+		*min_hi = a_hi;
+		*min_lo = std::min(a_lo, b_lo);
+	}
+}
+
+/******************************************************//**
+Three-way comparison of two ulints.
+@return 1 if a > b, 0 if a == b, -1 if a < b */
+UNIV_INLINE
+int
+ut_ulint_cmp(
+/*=========*/
+	ulint	a,	/*!< in: left operand */
+	ulint	b)	/*!< in: right operand */
+{
+	if (a == b) {
+		return(0);
+	}
+
+	/* a != b here, so only the direction remains to be decided */
+
+	return(a < b ? -1 : 1);
+}
+
+/** Three-way comparison of two integer pairs, most significant part first.
+@param[in]	a_h	more significant part of first pair
+@param[in]	a_l	less significant part of first pair
+@param[in]	b_h	more significant part of second pair
+@param[in]	b_l	less significant part of second pair
+@return comparison result of (a_h,a_l) and (b_h,b_l)
+@retval -1 if (a_h,a_l) is less than (b_h,b_l)
+@retval 0 if (a_h,a_l) is equal to (b_h,b_l)
+@retval 1 if (a_h,a_l) is greater than (b_h,b_l) */
+UNIV_INLINE
+int
+ut_pair_cmp(
+	ulint	a_h,
+	ulint	a_l,
+	ulint	b_h,
+	ulint	b_l)
+{
+	if (a_h != b_h) {
+		/* the more significant parts alone decide the order */
+		return(a_h < b_h ? -1 : 1);
+	}
+
+	/* equal high parts: the low parts break the tie */
+	return(ut_ulint_cmp(a_l, b_l));
+}
+
+/*************************************************************//**
+Computes the base-2 logarithm of n, rounded upward to an integer. The
+result is never below 1, so n == 1 yields 1 just as n == 2 does.
+@return logarithm in the base 2, rounded upward; at least 1 */
+UNIV_INLINE
+ulint
+ut_2_log(
+/*=====*/
+	ulint	n)	/*!< in: number != 0 */
+{
+	ulint	steps = 1;
+
+	ut_ad(n > 0);
+
+	/* Work on n - 1 so that an exact power of two is not rounded
+	up to the next integer. The first halving is not counted; it
+	is covered by the initial value of steps. */
+
+	ulint	rest = (n - 1) / 2;
+
+	/* Every further halving that starts from a nonzero value
+	accounts for one more step. */
+
+	while (rest != 0) {
+		rest /= 2;
+		steps++;
+	}
+
+	return(steps);
+}
+
+/*************************************************************//**
+Raises 2 to the n'th power; n must be below the bit width of ulint.
+@return 2 to power n, computed as a left shift of 1 */
+UNIV_INLINE
+ulint
+ut_2_exp(
+/*=====*/
+	ulint	n)	/*!< in: exponent, used as the shift count */
+{
+	return(static_cast<ulint>(1) << n);
+}
diff --git a/storage/innobase/include/ut0vec.h b/storage/innobase/include/ut0vec.h
new file mode 100644
index 00000000..f4660f96
--- /dev/null
+++ b/storage/innobase/include/ut0vec.h
@@ -0,0 +1,285 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0vec.h
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+#ifndef IB_VECTOR_H
+#define IB_VECTOR_H
+
+#include "mem0mem.h"
+
+struct ib_alloc_t;
+struct ib_vector_t;
+
+typedef void* (*ib_mem_alloc_t)(
+					/* out: Pointer to allocated memory */
+	ib_alloc_t*	allocator,	/* in: Pointer to allocator instance */
+	ulint		size);		/* in: Number of bytes to allocate */
+/* Callback releasing memory; stored in ib_alloc_t::mem_release. */
+typedef void (*ib_mem_free_t)(
+	ib_alloc_t*	allocator,	/* in: Pointer to allocator instance */
+	void*		ptr);		/* in: Memory to free */
+/* Callback resizing memory; stored in ib_alloc_t::mem_resize. */
+typedef void* (*ib_mem_resize_t)(
+					/* out: Pointer to resized memory */
+	ib_alloc_t*	allocator,	/* in: Pointer to allocator instance */
+	void*		ptr,		/* in: Memory to resize */
+	ulint		old_size,	/* in: Old memory size in bytes */
+	ulint		new_size);	/* in: New memory size in bytes */
+/* Comparison callback; ib_vector_sort() hands it to qsort(). */
+typedef int (*ib_compare_t)(const void*, const void*);
+
+/* An automatically resizing vector datatype with the following properties:
+
+ - All memory allocation is done through an allocator, which is responsible
+for freeing it when done with the vector.
+*/
+
+/* Shorthand for vectors whose elements are of type void* */
+#define	ib_vector_getp(v, n)	(*(void**) ib_vector_get(v, n))
+#define	ib_vector_getp_const(v, n)	(*(void**) ib_vector_get_const(v, n))
+/* The allocator of vector v; the argument is parenthesized on expansion */
+#define ib_vector_allocator(v)	((v)->allocator)
+
+/********************************************************************
+Create a new vector with the given initial size. */
+ib_vector_t*
+ib_vector_create(
+/*=============*/
+					/* out: the new vector */
+	ib_alloc_t*	alloc,		/* in: allocator serving the vector */
+					/* in: size of one data item in bytes */
+	ulint		sizeof_value,
+	ulint		size);		/* in: initial size (presumably a number of elements - confirm) */
+
+/********************************************************************
+Destroy the vector by freeing the heap behind its allocator. Make sure the
+vector owns the allocator, e.g., the heap in the heap allocator. */
+UNIV_INLINE
+void
+ib_vector_free(
+/*===========*/
+	ib_vector_t*	vec);		/* in/out: vector */
+
+/********************************************************************
+Push a new element to the vector, increasing its size if necessary.
+If elem is not NULL then sizeof_value bytes of it are copied to the vector. */
+UNIV_INLINE
+void*
+ib_vector_push(
+/*===========*/
+					/* out: pointer to the new element */
+	ib_vector_t*	vec,		/* in/out: vector */
+	const void*	elem);		/* in: data element, or NULL */
+
+/********************************************************************
+Pop the last element from the vector. The vector must not be empty. */
+UNIV_INLINE
+void*
+ib_vector_pop(
+/*==========*/
+					/* out: pointer to the popped element */
+	ib_vector_t*	vec);		/* in/out: vector */
+
+/*******************************************************************//**
+Remove an element from a vector of pointers; elements are matched by value.
+@return pointer to the slot of the "removed" element, NULL if not found */
+UNIV_INLINE
+void*
+ib_vector_remove(
+/*=============*/
+	ib_vector_t*	vec,	/*!< in/out: vector */
+	const void*	elem);	/*!< in: pointer value to remove */
+
+/********************************************************************
+Get the number of elements in use in the vector (not the allocated count). */
+UNIV_INLINE
+ulint
+ib_vector_size(
+/*===========*/
+					/* out: number of elements in vector */
+	const ib_vector_t*	vec);	/* in: vector */
+
+/********************************************************************
+Increase the size of the vector. */
+void
+ib_vector_resize(
+/*=============*/
+					/* called by ib_vector_push() when full */
+	ib_vector_t*	vec);		/* in/out: vector */
+
+/********************************************************************
+Test whether a vector is empty, i.e. whether it has no elements in use.
+@return TRUE if empty */
+UNIV_INLINE
+ibool
+ib_vector_is_empty(
+/*===============*/
+	const ib_vector_t*	vec);    /*!< in: vector */
+
+/****************************************************************//**
+Get the n'th element. n must be less than the number of elements in use.
+@return pointer to the n'th element */
+UNIV_INLINE
+void*
+ib_vector_get(
+/*==========*/
+	ib_vector_t*	vec,	/*!< in: vector */
+	ulint		n);	/*!< in: element index to get */
+
+/********************************************************************
+Const version of the get n'th element; n must be less than the size.
+@return pointer to the n'th element */
+UNIV_INLINE
+const void*
+ib_vector_get_const(
+/*================*/
+	const ib_vector_t*	vec,	/* in: vector */
+	ulint			n);	/* in: element index to get */
+/****************************************************************//**
+Get last element. The vector must not be empty.
+@return pointer to the last element */
+UNIV_INLINE
+void*
+ib_vector_get_last(
+/*===============*/
+	ib_vector_t*	vec);	/*!< in: vector */
+/****************************************************************//**
+Set the n'th element by copying sizeof_value bytes from elem into it. */
+UNIV_INLINE
+void
+ib_vector_set(
+/*==========*/
+	ib_vector_t*	vec,	/*!< in/out: vector */
+	ulint		n,	/*!< in: element index to set, below the size */
+	void*		elem);	/*!< in: data element to copy from */
+
+/********************************************************************
+Reset the vector size to 0 elements; the allocated memory is kept. */
+UNIV_INLINE
+void
+ib_vector_reset(
+/*============*/
+	ib_vector_t*	vec);		/* in/out: vector */
+
+/********************************************************************
+Get the last element of the vector. The vector must not be empty. */
+UNIV_INLINE
+void*
+ib_vector_last(
+/*===========*/
+					/* out: pointer to last element */
+	ib_vector_t*	vec);		/* in: vector */
+
+/********************************************************************
+Const version of ib_vector_last(). The vector must not be empty. */
+UNIV_INLINE
+const void*
+ib_vector_last_const(
+/*=================*/
+					/* out: pointer to last element */
+	const ib_vector_t*	vec);	/* in: vector */
+
+/********************************************************************
+Sort the vector elements in place with qsort(). */
+UNIV_INLINE
+void
+ib_vector_sort(
+/*===========*/
+	ib_vector_t*	vec,		/* in/out: vector */
+	ib_compare_t	compare);	/* in: the comparator to use for sort */
+
+/********************************************************************
+The default ib_vector_t heap free. Does nothing. */
+UNIV_INLINE
+void
+ib_heap_free(
+/*=========*/
+	ib_alloc_t*	allocator,	/* in: allocator */
+	void*		ptr);		/* in: memory to free (ignored) */
+
+/********************************************************************
+The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_malloc(
+/*===========*/
+					/* out: pointer to allocated memory */
+	ib_alloc_t*	allocator,	/* in: allocator; arg is the heap */
+	ulint		size);		/* in: size in bytes */
+
+/********************************************************************
+The default ib_vector_t heap resize. Since we can't resize the heap
+we have to copy the elements from the old ptr to the new ptr.
+Uses mem_heap_alloc(). new_size must not be smaller than old_size. */
+UNIV_INLINE
+void*
+ib_heap_resize(
+/*===========*/
+					/* out: pointer to reallocated
+					memory */
+	ib_alloc_t*	allocator,	/* in: allocator; arg is the heap */
+	void*		old_ptr,	/* in: pointer to memory */
+	ulint		old_size,	/* in: old size in bytes */
+	ulint		new_size);	/* in: new size in bytes */
+
+/********************************************************************
+Create a heap allocator that uses, and is itself allocated from, the heap. */
+UNIV_INLINE
+ib_alloc_t*
+ib_heap_allocator_create(
+/*=====================*/
+					/* out: heap allocator instance */
+	mem_heap_t*	heap);		/* in: heap to use */
+
+/********************************************************************
+Free a heap allocator by freeing the heap its arg field points to. */
+UNIV_INLINE
+void
+ib_heap_allocator_free(
+/*===================*/
+	ib_alloc_t*	ib_ut_alloc);	/* in: alloc instance to free */
+
+/* Allocator used by ib_vector_t: three callbacks plus their context. */
+struct ib_alloc_t {
+	ib_mem_alloc_t	mem_malloc;	/* For allocating memory */
+	ib_mem_free_t	mem_release;	/* For freeing memory */
+	ib_mem_resize_t	mem_resize;	/* For resizing memory */
+	void*		arg;		/* Currently if not NULL then it
+					points to the heap instance */
+};
+
+/* An automatically resizing vector; see comment at beginning of file. */
+struct ib_vector_t {
+	ib_alloc_t*	allocator;	/* Allocator, because one size
+					doesn't fit all */
+	void*		data;		/* data elements, stored back to back */
+	ulint		used;		/* number of elements currently used */
+	ulint		total;		/* number of elements allocated */
+					/* Size of a data item in bytes */
+	ulint		sizeof_value;
+};
+
+#include "ut0vec.inl"
+
+#endif /* IB_VECTOR_H */
diff --git a/storage/innobase/include/ut0vec.inl b/storage/innobase/include/ut0vec.inl
new file mode 100644
index 00000000..531f0f22
--- /dev/null
+++ b/storage/innobase/include/ut0vec.inl
@@ -0,0 +1,348 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0vec.inl
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+#define	IB_VEC_OFFSET(v, i)	((v)->sizeof_value * (i)) /* byte offset of item i in v */
+
+/********************************************************************
+Default allocation callback of the heap allocator: mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_malloc(
+/*===========*/
+	ib_alloc_t*	allocator,	/* in: allocator; arg is the heap */
+	ulint		size)		/* in: size in bytes */
+{
+	/* arg of a heap allocator is the mem_heap_t it draws from */
+	return(mem_heap_alloc(
+		       static_cast<mem_heap_t*>(allocator->arg), size));
+}
+
+/********************************************************************
+The default ib_vector_t heap free. Does nothing. */
+UNIV_INLINE
+void
+ib_heap_free(
+/*=========*/
+	ib_alloc_t*	allocator UNIV_UNUSED,	/* in: allocator */
+	void*		ptr UNIV_UNUSED)	/* in: memory to free (ignored) */
+{
+	/* We can't free individual elements. */
+}
+
+/********************************************************************
+Default resize callback of the heap allocator. A heap block cannot be
+resized in place, so a new block is taken with mem_heap_alloc() and the
+old contents are copied into it. new_size >= old_size is asserted, so
+the copy cannot overflow the new block. */
+UNIV_INLINE
+void*
+ib_heap_resize(
+/*===========*/
+	ib_alloc_t*	allocator,	/* in: allocator; arg is the heap */
+	void*		old_ptr,	/* in: pointer to memory */
+	ulint		old_size,	/* in: old size in bytes */
+	ulint		new_size)	/* in: new size in bytes */
+{
+	mem_heap_t*	heap = static_cast<mem_heap_t*>(allocator->arg);
+
+	ut_a(new_size >= old_size);
+
+	/* The old block is not released here; only its first
+	old_size bytes are carried over to the new block. */
+	void*	grown = mem_heap_alloc(heap, new_size);
+	return(memcpy(grown, old_ptr, old_size));
+}
+
+/********************************************************************
+Create a heap allocator that lives in, and allocates from, the given heap. */
+UNIV_INLINE
+ib_alloc_t*
+ib_heap_allocator_create(
+/*=====================*/
+	mem_heap_t*	heap)		/* in: heap to use */
+{
+	ib_alloc_t*	self = static_cast<ib_alloc_t*>(
+		mem_heap_alloc(heap, sizeof(ib_alloc_t)));
+
+	/* wire up the heap-backed callbacks and their context */
+	self->mem_malloc = ib_heap_malloc;
+	self->mem_release = ib_heap_free;
+	self->mem_resize = ib_heap_resize;
+	self->arg = heap;
+
+	return(self);
+}
+
+/********************************************************************
+Free a heap allocator by freeing the heap that its arg field points to. */
+UNIV_INLINE
+void
+ib_heap_allocator_free(
+/*===================*/
+	ib_alloc_t*	ib_ut_alloc)	/* in: alloc instance to free */
+{
+	mem_heap_free(static_cast<mem_heap_t*>(ib_ut_alloc->arg));
+}
+
+/********************************************************************
+Get number of elements in use in the vector (not the allocated count). */
+UNIV_INLINE
+ulint
+ib_vector_size(
+/*===========*/
+					/* out: number of elements in vector */
+	const ib_vector_t*	vec)	/* in: vector */
+{
+	return(vec->used);
+}
+
+/****************************************************************//**
+Get a pointer to the n'th element; asserts that n is below the size. */
+UNIV_INLINE
+void*
+ib_vector_get(
+/*==========*/
+	ib_vector_t*	vec,	/*!< in: vector */
+	ulint		n)	/*!< in: element index to get */
+{
+	ut_a(n < vec->used);
+	byte*	first = static_cast<byte*>(vec->data);
+	return(first + IB_VEC_OFFSET(vec, n));
+}
+
+/********************************************************************
+Const counterpart of ib_vector_get(); asserts that n is below the size.
+@return pointer to the n'th element */
+UNIV_INLINE
+const void*
+ib_vector_get_const(
+/*================*/
+	const ib_vector_t*	vec,	/* in: vector */
+	ulint			n)	/* in: element index to get */
+{
+	ut_a(n < vec->used);
+	const byte*	first = static_cast<const byte*>(vec->data);
+	return(first + IB_VEC_OFFSET(vec, n));
+}
+/****************************************************************//**
+Get the element at the highest used index. The vector must not be empty.
+@return pointer to the last element */
+UNIV_INLINE
+void*
+ib_vector_get_last(
+/*===============*/
+	ib_vector_t*	vec)	/*!< in: vector */
+{
+	ut_a(vec->used > 0);
+	const ulint	tail = vec->used - 1;
+	return(ib_vector_get(vec, tail));
+}
+
+/****************************************************************//**
+Overwrite the n'th element with a copy of the data at elem. */
+UNIV_INLINE
+void
+ib_vector_set(
+/*==========*/
+	ib_vector_t*	vec,	/*!< in/out: vector */
+	ulint		n,	/*!< in: element index to set */
+	void*		elem)	/*!< in: data element to copy from */
+{
+	ut_a(n < vec->used);
+
+	/* overwrite the n'th slot in place with sizeof_value bytes
+	taken from elem */
+	memcpy(static_cast<byte*>(vec->data) + IB_VEC_OFFSET(vec, n),
+	       elem, vec->sizeof_value);
+}
+
+/********************************************************************
+Reset the vector size to 0 elements; data and total are left untouched. */
+UNIV_INLINE
+void
+ib_vector_reset(
+/*============*/
+					/* out: void */
+	ib_vector_t*	vec)		/* in/out: vector */
+{
+	vec->used = 0;
+}
+
+/********************************************************************
+Get a pointer to the last element. The vector must not be empty. */
+UNIV_INLINE
+void*
+ib_vector_last(
+/*===========*/
+					/* out: pointer to last element */
+	ib_vector_t*	vec)		/* in: vector */
+{
+	ut_a(ib_vector_size(vec) > 0);
+	const ulint	tail = ib_vector_size(vec) - 1;
+	return(ib_vector_get(vec, tail));
+}
+
+/********************************************************************
+Const counterpart of ib_vector_last(). The vector must not be empty. */
+UNIV_INLINE
+const void*
+ib_vector_last_const(
+/*=================*/
+					/* out: pointer to last element */
+	const ib_vector_t*	vec)	/* in: vector */
+{
+	ut_a(ib_vector_size(vec) > 0);
+	const ulint	tail = ib_vector_size(vec) - 1;
+	return(ib_vector_get_const(vec, tail));
+}
+
+/****************************************************************//**
+Drop the last element from the vector, which must not be empty.
+@return pointer to the slot of the dropped element */
+UNIV_INLINE
+void*
+ib_vector_pop(
+/*==========*/
+				/* out: pointer to element */
+	ib_vector_t*	vec)	/* in/out: vector */
+{
+	ut_a(vec->used > 0);
+
+	/* Take the address first: ib_vector_last() asserts on the
+	element count as it is before the decrement. */
+	void*	popped = ib_vector_last(vec);
+
+	vec->used--;
+	return(popped);
+}
+
+/********************************************************************
+Append an element to the vector, growing the vector first when it is
+full. If elem != NULL then the data is copied from elem. */
+UNIV_INLINE
+void*
+ib_vector_push(
+/*===========*/
+				/* out: pointer to the "new" element */
+	ib_vector_t*	vec,	/* in/out: vector */
+	const void*	elem)	/* in: element to add (can be NULL) */
+{
+	if (vec->used >= vec->total) {
+		/* every allocated slot is taken: grow before writing */
+		ib_vector_resize(vec);
+	}
+
+	/* vec->data is read only after the possible resize above */
+	byte*	slot = static_cast<byte*>(vec->data)
+		+ IB_VEC_OFFSET(vec, vec->used);
+
+#ifdef UNIV_DEBUG
+	memset(slot, 0, vec->sizeof_value);
+#endif
+
+	if (elem != NULL) {
+		memcpy(slot, elem, vec->sizeof_value);
+	}
+
+	vec->used++;
+	return(slot);
+}
+
+/*******************************************************************//**
+Remove the first element whose pointer value equals elem from the vector
+@return pointer to the slot of the "removed" element, NULL if not found */
+UNIV_INLINE
+void*
+ib_vector_remove(
+/*=============*/
+	ib_vector_t*	vec,	/*!< in/out: vector */
+	const void*	elem)	/*!< in: value to remove */
+{
+	for (ulint i = 0; i < vec->used; i++) {
+		void*	slot = ib_vector_get(vec, i);
+
+		if (*(void**) slot != elem) {
+			continue;
+		}
+
+		if (i == vec->used - 1) {
+			/* last slot: nothing to shift down */
+			return(ib_vector_pop(vec));
+		}
+
+		/* close the gap by moving the tail one slot down;
+		slot then holds the former successor's value */
+		memmove(slot, ib_vector_get(vec, i + 1),
+			vec->sizeof_value * (vec->used - i - 1));
+		--vec->used;
+
+		return(slot);
+	}
+
+	return(NULL);
+}
+
+/********************************************************************
+Sort the used elements of the vector in place with qsort(). */
+UNIV_INLINE
+void
+ib_vector_sort(
+/*===========*/
+				/* out: void */
+	ib_vector_t*	vec,	/* in/out: vector */
+	ib_compare_t	compare)/* in: the comparator to use for sort */
+{
+	qsort(vec->data, vec->used, vec->sizeof_value, compare);
+}
+
+/********************************************************************
+Destroy the vector by freeing the heap behind its allocator. Make sure
+the vector owns the allocator, e.g., the heap in the heap allocator. */
+UNIV_INLINE
+void
+ib_vector_free(
+/*===========*/
+	ib_vector_t*	vec)		/* in, own: vector */
+{
+	/* The heap allocator is the only kind supported so far, and it
+	alone uses arg; releasing that heap releases every element. */
+	ut_ad(vec->allocator->arg != NULL);
+
+	mem_heap_t*	heap = static_cast<mem_heap_t*>(vec->allocator->arg);
+
+	mem_heap_free(heap);
+}
+
+/********************************************************************
+Tell whether the vector has no elements in use.
+@return TRUE if empty */
+UNIV_INLINE
+ibool
+ib_vector_is_empty(
+/*===============*/
+	const ib_vector_t*	vec)	/*!< in: vector */
+{
+	return(vec->used == 0);
+}
diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h
new file mode 100644
index 00000000..95c7a248
--- /dev/null
+++ b/storage/innobase/include/ut0wqueue.h
@@ -0,0 +1,86 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0wqueue.h
+A work queue
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/*******************************************************************//**
+A Work queue. Threads can add work items to the queue and other threads can
+wait for work items to be available and take them off the queue for
+processing.
+************************************************************************/
+
+#pragma once
+
+#include "ut0list.h"
+#include "mem0mem.h"
+
+// Forward declaration
+struct ib_list_t;
+
+/** Work queue: a list of work items together with its mutex */
+struct ib_wqueue_t
+{
+  /** Mutex protecting everything */
+  mysql_mutex_t mutex;
+  /** Work item list */
+  ib_list_t *items;
+  /** Number of work items, i.e. ib_list_len(*items) */
+  size_t length;
+};
+
+/****************************************************************//**
+Create a new work queue.
+@return the new work queue (see ib_wqueue_free()) */
+ib_wqueue_t*
+ib_wqueue_create();
+/*===============*/
+
+/****************************************************************//**
+Free a work queue, the counterpart of ib_wqueue_create(). */
+void
+ib_wqueue_free(
+/*===========*/
+	ib_wqueue_t*	wq);		/*!< in, own: work queue */
+
+/** Add a work item to the queue.
+@param[in,out]	wq		work queue
+@param[in]	item		work item
+@param[in,out]	heap		memory heap to use for allocating list node
+@param[in]	wq_locked	whether the work queue mutex is already locked */
+void
+ib_wqueue_add(ib_wqueue_t* wq, void* item, mem_heap_t* heap,
+	      bool wq_locked = false);
+
+/** Check if queue is empty.
+@param wq work queue
+@return whether the queue is empty */
+bool ib_wqueue_is_empty(ib_wqueue_t* wq);
+
+/********************************************************************
+Return first item on work queue or NULL if queue is empty
+@return work item or NULL */
+void*
+ib_wqueue_nowait(
+/*=============*/
+	ib_wqueue_t*	wq);		/*!< in: work queue */