author     Daniel Baumann <daniel.baumann@progress-linux.org>    2024-05-04 18:07:14 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>    2024-05-04 18:07:14 +0000
commit     a175314c3e5827eb193872241446f2f8f5c9d33c
tree       cd3d60ca99ae00829c52a6ca79150a5b6e62528b /storage/innobase/btr
parent     Initial commit.
Adding upstream version 1:10.5.12.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/btr')
-rw-r--r--  storage/innobase/btr/btr0btr.cc         | 5192
-rw-r--r--  storage/innobase/btr/btr0bulk.cc        | 1238
-rw-r--r--  storage/innobase/btr/btr0cur.cc         | 8279
-rw-r--r--  storage/innobase/btr/btr0defragment.cc  |  843
-rw-r--r--  storage/innobase/btr/btr0pcur.cc        |  681
-rw-r--r--  storage/innobase/btr/btr0sea.cc         | 2372
6 files changed, 18605 insertions(+), 0 deletions(-)
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc new file mode 100644 index 00000000..de87ad02 --- /dev/null +++ b/storage/innobase/btr/btr0btr.cc @@ -0,0 +1,5192 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2014, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0btr.cc +The B-tree + +Created 6/2/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0btr.h" + +#include "page0page.h" +#include "page0zip.h" +#include "gis0rtree.h" + +#include "btr0cur.h" +#include "btr0sea.h" +#include "btr0pcur.h" +#include "btr0defragment.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "ibuf0ibuf.h" +#include "trx0trx.h" +#include "srv0mon.h" +#include "gis0geo.h" +#include "dict0boot.h" +#include "row0sel.h" /* row_search_max_autoinc() */ + +Atomic_counter<uint32_t> btr_validate_index_running; + +/**************************************************************//** +Checks if the page in the cursor can be merged with given page. +If necessary, re-organize the merge_page. +@return true if possible to merge. */ +static +bool +btr_can_merge_with_page( +/*====================*/ + btr_cur_t* cursor, /*!< in: cursor on the page to merge */ + uint32_t page_no, /*!< in: a sibling page */ + buf_block_t** merge_block, /*!< out: the merge block */ + mtr_t* mtr); /*!< in: mini-transaction */ + +/** Report that an index page is corrupted. +@param[in] buffer block +@param[in] index tree */ +void btr_corruption_report(const buf_block_t* block, const dict_index_t* index) +{ + ib::fatal() + << "Flag mismatch in page " << block->page.id() + << " index " << index->name + << " of table " << index->table->name; +} + +/* +Latching strategy of the InnoDB B-tree +-------------------------------------- + +Node pointer page latches acquisition is protected by index->lock latch. + +Before MariaDB 10.2.2, all node pointer pages were protected by index->lock +either in S (shared) or X (exclusive) mode and block->lock was not acquired on +node pointer pages. + +After MariaDB 10.2.2, block->lock S-latch or X-latch is used to protect +node pointer pages and obtaiment of node pointer page latches is protected by +index->lock. + +(0) Definition: B-tree level. + +(0.1) The leaf pages of the B-tree are at level 0. + +(0.2) The parent of a page at level L has level L+1. (The level of the +root page is equal to the tree height.) + +(0.3) The B-tree lock (index->lock) is the parent of the root page and +has a level = tree height + 1. 
+
+Index->lock has 3 possible locking modes:
+
+(1) S-latch:
+
+(1.1) All latches for pages must be obtained in descending order of tree level.
+
+(1.2) Before obtaining the first node pointer page latch at a given B-tree
+level, the parent latch must be held (at level + 1).
+
+(1.3) If a node pointer page is already latched at the same level,
+we can only obtain a latch on its right sibling page at the same level.
+
+(1.4) Release of the node pointer page latches must be done in
+child-to-parent order. (This prevents deadlocks when index->lock
+is obtained in SX mode.)
+
+(1.4.1) A level L node pointer page latch can be released only when
+no latches at the children's level, i.e. level < L, are held.
+
+(1.4.2) All latches on node pointer pages must be released so
+that no latches are acquired in between.
+
+(1.5) [implied by (1.1), (1.2)] The root page latch must be the first node
+pointer latch obtained.
+
+(2) SX-latch:
+
+In this case rules (1.2) and (1.3) from the S-latch case are relaxed and
+merged into (2.2), and rule (1.4) is removed. Thus, latch acquisition
+can be skipped at some tree levels and latches can be obtained in
+a less restricted order.
+
+(2.1) [identical to (1.1)]: All latches for pages must be obtained in
+descending order of tree level.
+
+(2.2) When a node pointer latch at level L is obtained,
+the left sibling page latch at the same level or some ancestor
+page latch (at level > L) must be held.
+
+(2.3) [implied by (2.1), (2.2)] The first node pointer page latch obtained
+can be any node pointer page.
+
+(3) X-latch:
+
+Node pointer latches can be obtained in any order.
+
+NOTE: The new rules after MariaDB 10.2.2 do not affect the latching rules
+of leaf pages:
+
+An index->lock S-latch is needed in read mode for the node pointer traversal.
+When the leaf level is reached, index->lock can be released (and with the
+MariaDB 10.2.2 changes, all node pointer latches). Left-to-right index
+traversal at the leaf page level can be safely done by obtaining the right
+sibling leaf page latch and then releasing the old page latch.
+
+Single leaf page modifications (BTR_MODIFY_LEAF) are protected by index->lock
+S-latch.
+
+B-tree operations involving page splits or merges (BTR_MODIFY_TREE) and page
+allocations are protected by index->lock X-latch.
+
+Node pointers
+-------------
+Leaf pages of a B-tree contain the index records stored in the
+tree. On levels n > 0 we store 'node pointers' to pages on level
+n - 1. For each page there is exactly one node pointer stored:
+thus our tree is an ordinary B-tree, not a B-link tree.
+
+A node pointer contains a prefix P of an index record. The prefix
+is long enough so that it determines an index record uniquely.
+The file page number of the child page is added as the last
+field. To the child page we can store node pointers or index records
+which are >= P in the alphabetical order, but < P1 if there is
+a next node pointer on the level, and P1 is its prefix.
+
+If a node pointer with a prefix P points to a non-leaf child,
+then the leftmost record in the child must have the same
+prefix P. If it points to a leaf node, the child is not required
+to contain any record with a prefix equal to P. The leaf case
+is decided this way to allow arbitrary deletions in a leaf node
+without touching upper levels of the tree.
+
+We have predefined a special minimum record which we
+define as the smallest record in any alphabetical order.
+A minimum record is denoted by setting a bit in the record
+header.
A minimum record acts as the prefix of a node pointer +which points to a leftmost node on any level of the tree. + +File page allocation +-------------------- +In the root node of a B-tree there are two file segment headers. +The leaf pages of a tree are allocated from one file segment, to +make them consecutive on disk if possible. From the other file segment +we allocate pages for the non-leaf levels of the tree. +*/ + +#ifdef UNIV_BTR_DEBUG +/**************************************************************//** +Checks a file segment header within a B-tree root page. +@return TRUE if valid */ +static +ibool +btr_root_fseg_validate( +/*===================*/ + const fseg_header_t* seg_header, /*!< in: segment header */ + ulint space) /*!< in: tablespace identifier */ +{ + ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET); + + ut_a(mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space); + ut_a(offset >= FIL_PAGE_DATA); + ut_a(offset <= srv_page_size - FIL_PAGE_DATA_END); + return(TRUE); +} +#endif /* UNIV_BTR_DEBUG */ + +/**************************************************************//** +Gets the root node of a tree and x- or s-latches it. +@return root page, x- or s-latched */ +buf_block_t* +btr_root_block_get( +/*===============*/ + const dict_index_t* index, /*!< in: index tree */ + rw_lock_type_t mode, /*!< in: either RW_S_LATCH + or RW_X_LATCH */ + mtr_t* mtr) /*!< in: mtr */ +{ + if (!index->table || !index->table->space) { + return NULL; + } + + buf_block_t* block = btr_block_get(*index, index->page, mode, false, + mtr); + + if (!block) { + index->table->file_unreadable = true; + + ib_push_warning( + static_cast<THD*>(NULL), DB_DECRYPTION_FAILED, + "Table %s in file %s is encrypted but encryption service or" + " used key_id is not available. " + " Can't continue reading table.", + index->table->name.m_name, + UT_LIST_GET_FIRST(index->table->space->chain)->name); + + return NULL; + } + + btr_assert_not_corrupted(block, index); + +#ifdef UNIV_BTR_DEBUG + if (!dict_index_is_ibuf(index)) { + const page_t* root = buf_block_get_frame(block); + + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + root, index->table->space_id)); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, index->table->space_id)); + } +#endif /* UNIV_BTR_DEBUG */ + + return(block); +} + +/**************************************************************//** +Gets the root node of a tree and sx-latches it for segment access. +@return root page, sx-latched */ +page_t* +btr_root_get( +/*=========*/ + const dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in: mtr */ +{ + /* Intended to be used for segment list access. + SX lock doesn't block reading user data by other threads. + And block the segment list access by others.*/ + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr); + return(root ? buf_block_get_frame(root) : NULL); +} + +/**************************************************************//** +Gets the height of the B-tree (the level of the root, when the leaf +level is assumed to be 0). The caller must hold an S or X latch on +the index. 
+@return tree height (level of the root) */ +ulint +btr_height_get( +/*===========*/ + const dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint height=0; + buf_block_t* root_block; + + ut_ad(srv_read_only_mode + || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK + | MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + + /* S latches the page */ + root_block = btr_root_block_get(index, RW_S_LATCH, mtr); + + if (root_block) { + height = btr_page_get_level(buf_block_get_frame(root_block)); + + /* Release the S latch on the root page. */ + mtr->memo_release(root_block, MTR_MEMO_PAGE_S_FIX); + + ut_d(sync_check_unlock(&root_block->lock)); + } + + return(height); +} + +/**************************************************************//** +Checks a file segment header within a B-tree root page and updates +the segment header space id. +@return TRUE if valid */ +static +bool +btr_root_fseg_adjust_on_import( +/*===========================*/ + fseg_header_t* seg_header, /*!< in/out: segment header */ + page_zip_des_t* page_zip, /*!< in/out: compressed page, + or NULL */ + ulint space) /*!< in: tablespace identifier */ +{ + ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET); + + if (offset < FIL_PAGE_DATA + || offset > srv_page_size - FIL_PAGE_DATA_END) { + return false; + } + + seg_header += FSEG_HDR_SPACE; + + mach_write_to_4(seg_header, space); + if (UNIV_LIKELY_NULL(page_zip)) { + memcpy(page_zip->data + page_offset(seg_header), seg_header, + 4); + } + + return true; +} + +/**************************************************************//** +Checks and adjusts the root node of a tree during IMPORT TABLESPACE. +@return error code, or DB_SUCCESS */ +dberr_t +btr_root_adjust_on_import( +/*======================*/ + const dict_index_t* index) /*!< in: index tree */ +{ + dberr_t err; + mtr_t mtr; + page_t* page; + page_zip_des_t* page_zip; + dict_table_t* table = index->table; + + DBUG_EXECUTE_IF("ib_import_trigger_corruption_3", + return(DB_CORRUPTION);); + + mtr_start(&mtr); + + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); + + buf_block_t* block = buf_page_get_gen( + page_id_t(table->space->id, index->page), + table->space->zip_size(), RW_X_LATCH, NULL, BUF_GET, + __FILE__, __LINE__, + &mtr, &err); + if (!block) { + ut_ad(err != DB_SUCCESS); + goto func_exit; + } + + buf_block_dbg_add_level(block, SYNC_TREE_NODE); + + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + if (!fil_page_index_page_check(page) || page_has_siblings(page)) { + err = DB_CORRUPTION; + + } else if (dict_index_is_clust(index)) { + bool page_is_compact_format; + + page_is_compact_format = page_is_comp(page) > 0; + + /* Check if the page format and table format agree. */ + if (page_is_compact_format != dict_table_is_comp(table)) { + err = DB_CORRUPTION; + } else { + /* Check that the table flags and the tablespace + flags match. */ + ulint tf = dict_tf_to_fsp_flags(table->flags); + ulint sf = table->space->flags; + sf &= ~FSP_FLAGS_MEM_MASK; + tf &= ~FSP_FLAGS_MEM_MASK; + if (fil_space_t::is_flags_equal(tf, sf) + || fil_space_t::is_flags_equal(sf, tf)) { + mutex_enter(&fil_system.mutex); + table->space->flags = (table->space->flags + & ~FSP_FLAGS_MEM_MASK) + | (tf & FSP_FLAGS_MEM_MASK); + mutex_exit(&fil_system.mutex); + err = DB_SUCCESS; + } else { + err = DB_CORRUPTION; + } + } + } else { + err = DB_SUCCESS; + } + + /* Check and adjust the file segment headers, if all OK so far. 
*/ + if (err == DB_SUCCESS + && (!btr_root_fseg_adjust_on_import( + FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + page, page_zip, table->space_id) + || !btr_root_fseg_adjust_on_import( + FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + page, page_zip, table->space_id))) { + + err = DB_CORRUPTION; + } + +func_exit: + mtr_commit(&mtr); + + return(err); +} + +/**************************************************************//** +Creates a new index page (not the root, and also not +used in page reorganization). @see btr_page_empty(). */ +void +btr_page_create( +/*============*/ + buf_block_t* block, /*!< in/out: page to be created */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: the B-tree level of the page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + byte *index_id= my_assume_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID + + block->frame); + + if (UNIV_LIKELY_NULL(page_zip)) + { + mach_write_to_8(index_id, index->id); + page_create_zip(block, index, level, 0, mtr); + } + else + { + page_create(block, mtr, dict_table_is_comp(index->table)); + if (index->is_spatial()) + { + static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) == + FIL_PAGE_RTREE, "compatibility"); + mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame, + byte(FIL_PAGE_RTREE)); + if (mach_read_from_8(block->frame + FIL_RTREE_SPLIT_SEQ_NUM)) + mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0); + } + /* Set the level of the new index page */ + mtr->write<2,mtr_t::MAYBE_NOP>(*block, + my_assume_aligned<2>(PAGE_HEADER + + PAGE_LEVEL + + block->frame), level); + mtr->write<8,mtr_t::MAYBE_NOP>(*block, index_id, index->id); + } +} + +/**************************************************************//** +Allocates a new file page to be used in an ibuf tree. Takes the page from +the free list of the tree, which must contain pages! +@return new allocated block, x-latched */ +static +buf_block_t* +btr_page_alloc_for_ibuf( +/*====================*/ + dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* new_block; + + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr); + + fil_addr_t node_addr = flst_get_first(PAGE_HEADER + + PAGE_BTR_IBUF_FREE_LIST + + root->frame); + ut_a(node_addr.page != FIL_NULL); + + new_block = buf_page_get( + page_id_t(index->table->space_id, node_addr.page), + index->table->space->zip_size(), + RW_X_LATCH, mtr); + + buf_block_dbg_add_level(new_block, SYNC_IBUF_TREE_NODE_NEW); + + flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + new_block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, + mtr); + ut_d(flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr)); + + return(new_block); +} + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! 
+@retval NULL if no page could be allocated */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +buf_block_t* +btr_page_alloc_low( +/*===============*/ + dict_index_t* index, /*!< in: index */ + uint32_t hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr) /*!< in/out: mtr or another + mini-transaction in which the + page should be initialized. */ +{ + page_t* root = btr_root_get(index, mtr); + + fseg_header_t* seg_header = (level + ? PAGE_HEADER + PAGE_BTR_SEG_TOP + : PAGE_HEADER + PAGE_BTR_SEG_LEAF) + + root; + + /* Parameter TRUE below states that the caller has made the + reservation for free extents, and thus we know that a page can + be allocated: */ + + buf_block_t* block = fseg_alloc_free_page_general( + seg_header, hint_page_no, file_direction, + true, mtr, init_mtr); + + return block; +} + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! +@retval NULL if no page could be allocated */ +buf_block_t* +btr_page_alloc( +/*===========*/ + dict_index_t* index, /*!< in: index */ + uint32_t hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr) /*!< in/out: mini-transaction + for x-latching and initializing + the page */ +{ + buf_block_t* new_block; + + if (dict_index_is_ibuf(index)) { + + return(btr_page_alloc_for_ibuf(index, mtr)); + } + + new_block = btr_page_alloc_low( + index, hint_page_no, file_direction, level, mtr, init_mtr); + + if (new_block) { + buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW); + } + + return(new_block); +} + +/**************************************************************//** +Gets the number of pages in a B-tree. +@return number of pages, or ULINT_UNDEFINED if the index is unavailable */ +ulint +btr_get_size( +/*=========*/ + const dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ +{ + ulint n=0; + + ut_ad(srv_read_only_mode + || mtr->memo_contains(index->lock, MTR_MEMO_S_LOCK)); + ut_ad(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE); + + if (index->page == FIL_NULL + || dict_index_is_online_ddl(index) + || !index->is_committed() + || !index->table->space) { + return(ULINT_UNDEFINED); + } + + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr); + if (!root) { + return ULINT_UNDEFINED; + } + mtr_x_lock_space(index->table->space, mtr); + if (flag == BTR_N_LEAF_PAGES) { + fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF + + root->frame, &n, mtr); + } else { + ulint dummy; + n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_TOP + + root->frame, &dummy, mtr); + n += fseg_n_reserved_pages(*root, + PAGE_HEADER + PAGE_BTR_SEG_LEAF + + root->frame, &dummy, mtr); + } + + return(n); +} + +/**************************************************************//** +Gets the number of reserved and used pages in a B-tree. 
+@return number of pages reserved, or ULINT_UNDEFINED if the index +is unavailable */ +UNIV_INTERN +ulint +btr_get_size_and_reserved( +/*======================*/ + dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + ulint* used, /*!< out: number of pages used (<= reserved) */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ +{ + ulint dummy; + + ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_S_LOCK)); + ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE); + + if (index->page == FIL_NULL + || dict_index_is_online_ddl(index) + || !index->is_committed() + || !index->table->space) { + return(ULINT_UNDEFINED); + } + + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr); + *used = 0; + if (!root) { + return ULINT_UNDEFINED; + } + + mtr_x_lock_space(index->table->space, mtr); + + ulint n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF + + root->frame, used, mtr); + if (flag == BTR_TOTAL_SIZE) { + n += fseg_n_reserved_pages(*root, + PAGE_HEADER + PAGE_BTR_SEG_TOP + + root->frame, &dummy, mtr); + *used += dummy; + } + + return(n); +} + +/**************************************************************//** +Frees a page used in an ibuf tree. Puts the page to the free list of the +ibuf tree. */ +static +void +btr_page_free_for_ibuf( +/*===================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: block to be freed, x-latched */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr); + + flst_add_first(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr); + + ut_d(flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr)); +} + +/** Free an index page. +@param[in,out] index index tree +@param[in,out] block block to be freed +@param[in,out] mtr mini-transaction +@param[in] blob whether this is freeing a BLOB page */ +void btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr, + bool blob) +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); +#ifdef BTR_CUR_HASH_ADAPT + if (block->index && !block->index->freed()) { + ut_ad(!blob); + ut_ad(page_is_leaf(block->frame)); + } +#endif + const page_id_t id(block->page.id()); + ut_ad(index->table->space_id == id.space()); + /* The root page is freed by btr_free_root(). */ + ut_ad(id.page_no() != index->page); + ut_ad(mtr->is_named_space(index->table->space)); + + /* The page gets invalid for optimistic searches: increment the frame + modify clock */ + + buf_block_modify_clock_inc(block); + + if (dict_index_is_ibuf(index)) { + btr_page_free_for_ibuf(index, block, mtr); + return; + } + + /* TODO: Discard any operations for block from mtr->log. + The page will be freed, so previous changes to it by this + mini-transaction should not matter. */ + page_t* root = btr_root_get(index, mtr); + fseg_header_t* seg_header = &root[blob || page_is_leaf(block->frame) + ? PAGE_HEADER + PAGE_BTR_SEG_LEAF + : PAGE_HEADER + PAGE_BTR_SEG_TOP]; + fil_space_t* space= index->table->space; + const uint32_t page= id.page_no(); + + fseg_free_page(seg_header, space, page, mtr); + buf_page_free(space, page, mtr, __FILE__, __LINE__); + + /* The page was marked free in the allocation bitmap, but it + should remain exclusively latched until mtr_t::commit() or until it + is explicitly freed from the mini-transaction. 
*/ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); +} + +/** Set the child page number in a node pointer record. +@param[in,out] block non-leaf index page +@param[in,out] rec node pointer record in the page +@param[in] offsets rec_get_offsets(rec) +@param[in] page_no child page number +@param[in,out] mtr mini-transaction +Sets the child node file address in a node pointer. */ +inline void btr_node_ptr_set_child_page_no(buf_block_t *block, + rec_t *rec, const rec_offs *offsets, + ulint page_no, mtr_t *mtr) +{ + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!page_rec_is_leaf(rec)); + ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); + + const ulint offs= rec_offs_data_size(offsets); + ut_ad(rec_offs_nth_size(offsets, rec_offs_n_fields(offsets) - 1) == + REC_NODE_PTR_SIZE); + + if (UNIV_LIKELY_NULL(block->page.zip.data)) + page_zip_write_node_ptr(block, rec, offs, page_no, mtr); + else + mtr->write<4>(*block, rec + offs - REC_NODE_PTR_SIZE, page_no); +} + +/************************************************************//** +Returns the child page of a node pointer and sx-latches it. +@return child page, sx-latched */ +static +buf_block_t* +btr_node_ptr_get_child( +/*===================*/ + const rec_t* node_ptr,/*!< in: node pointer */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(rec_offs_validate(node_ptr, index, offsets)); + ut_ad(index->table->space_id + == page_get_space_id(page_align(node_ptr))); + + return btr_block_get( + *index, btr_node_ptr_get_child_page_no(node_ptr, offsets), + RW_SX_LATCH, btr_page_get_level(page_align(node_ptr)) == 1, + mtr); +} + +/************************************************************//** +Returns the upper level node pointer to a page. It is assumed that mtr holds +an sx-latch on the tree. 
+@return rec_get_offsets() of the node pointer record */ +static +rec_offs* +btr_page_get_father_node_ptr_func( +/*==============================*/ + rec_offs* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + btr_cur_t* cursor, /*!< in: cursor pointing to user record, + out: cursor on node pointer record, + its page x-latched */ + ulint latch_mode,/*!< in: BTR_CONT_MODIFY_TREE + or BTR_CONT_SEARCH_TREE */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + dtuple_t* tuple; + rec_t* user_rec; + rec_t* node_ptr; + ulint level; + ulint page_no; + dict_index_t* index; + + ut_ad(latch_mode == BTR_CONT_MODIFY_TREE + || latch_mode == BTR_CONT_SEARCH_TREE); + + page_no = btr_cur_get_block(cursor)->page.id().page_no(); + index = btr_cur_get_index(cursor); + ut_ad(!dict_index_is_spatial(index)); + + ut_ad(srv_read_only_mode + || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + + ut_ad(dict_index_get_page(index) != page_no); + + level = btr_page_get_level(btr_cur_get_page(cursor)); + + user_rec = btr_cur_get_rec(cursor); + ut_a(page_rec_is_user_rec(user_rec)); + + tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level); + dberr_t err = DB_SUCCESS; + + err = btr_cur_search_to_nth_level( + index, level + 1, tuple, + PAGE_CUR_LE, latch_mode, cursor, 0, + file, line, mtr); + + if (err != DB_SUCCESS) { + ib::warn() << " Error code: " << err + << " btr_page_get_father_node_ptr_func " + << " level: " << level + 1 + << " called from file: " + << file << " line: " << line + << " table: " << index->table->name + << " index: " << index->name(); + } + + node_ptr = btr_cur_get_rec(cursor); + + offsets = rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) { + rec_t* print_rec; + + ib::error() + << "Corruption of an index tree: table " + << index->table->name + << " index " << index->name + << ", father ptr page no " + << btr_node_ptr_get_child_page_no(node_ptr, offsets) + << ", child page no " << page_no; + + print_rec = page_rec_get_next( + page_get_infimum_rec(page_align(user_rec))); + offsets = rec_get_offsets(print_rec, index, offsets, + page_rec_is_leaf(user_rec) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + page_rec_print(print_rec, offsets); + offsets = rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); + page_rec_print(node_ptr, offsets); + + ib::fatal() + << "You should dump + drop + reimport the table to" + << " fix the corruption. If the crash happens at" + << " database startup. " << FORCE_RECOVERY_MSG + << " Then dump + drop + reimport."; + } + + return(offsets); +} + +#define btr_page_get_father_node_ptr(of,heap,cur,mtr) \ + btr_page_get_father_node_ptr_func( \ + of,heap,cur,BTR_CONT_MODIFY_TREE,__FILE__,__LINE__,mtr) + +#define btr_page_get_father_node_ptr_for_validate(of,heap,cur,mtr) \ + btr_page_get_father_node_ptr_func( \ + of,heap,cur,BTR_CONT_SEARCH_TREE,__FILE__,__LINE__,mtr) + +/************************************************************//** +Returns the upper level node pointer to a page. It is assumed that mtr holds +an x-latch on the tree. 
+@return rec_get_offsets() of the node pointer record */ +static +rec_offs* +btr_page_get_father_block( +/*======================*/ + rec_offs* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + dict_index_t* index, /*!< in: b-tree index */ + buf_block_t* block, /*!< in: child page in the index */ + mtr_t* mtr, /*!< in: mtr */ + btr_cur_t* cursor) /*!< out: cursor on node pointer record, + its page x-latched */ +{ + rec_t* rec + = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame( + block))); + btr_cur_position(index, rec, block, cursor); + return(btr_page_get_father_node_ptr(offsets, heap, cursor, mtr)); +} + +/** Seek to the parent page of a B-tree page. +@param[in,out] index b-tree +@param[in] block child page +@param[in,out] mtr mini-transaction +@param[out] cursor cursor pointing to the x-latched parent page */ +void btr_page_get_father(dict_index_t* index, buf_block_t* block, mtr_t* mtr, + btr_cur_t* cursor) +{ + mem_heap_t* heap; + rec_t* rec + = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame( + block))); + btr_cur_position(index, rec, block, cursor); + + heap = mem_heap_create(100); + btr_page_get_father_node_ptr(NULL, heap, cursor, mtr); + mem_heap_free(heap); +} + +#ifdef UNIV_DEBUG +/** PAGE_INDEX_ID value for freed index B-trees */ +constexpr index_id_t BTR_FREED_INDEX_ID = 0; +#endif + +/** Free a B-tree root page. btr_free_but_not_root() must already +have been called. +In a persistent tablespace, the caller must invoke fsp_init_file_page() +before mtr.commit(). +@param[in,out] block index root page +@param[in,out] mtr mini-transaction */ +static void btr_free_root(buf_block_t *block, mtr_t *mtr) +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->is_named_space(block->page.id().space())); + + btr_search_drop_page_hash_index(block); + +#ifdef UNIV_BTR_DEBUG + ut_a(btr_root_fseg_validate(PAGE_HEADER + PAGE_BTR_SEG_TOP + block->frame, + block->page.id().space())); +#endif /* UNIV_BTR_DEBUG */ + + /* Free the entire segment in small steps. */ + while (!fseg_free_step(PAGE_HEADER + PAGE_BTR_SEG_TOP + block->frame, mtr)); +} + +/** Prepare to free a B-tree. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] index_id PAGE_INDEX_ID contents +@param[in,out] mtr mini-transaction +@return root block, to invoke btr_free_but_not_root() and btr_free_root() +@retval NULL if the page is no longer a matching B-tree page */ +static MY_ATTRIBUTE((warn_unused_result)) +buf_block_t* +btr_free_root_check( + const page_id_t page_id, + ulint zip_size, + index_id_t index_id, + mtr_t* mtr) +{ + ut_ad(page_id.space() != SRV_TMP_SPACE_ID); + ut_ad(index_id != BTR_FREED_INDEX_ID); + + buf_block_t* block = buf_page_get( + page_id, zip_size, RW_X_LATCH, mtr); + + if (block) { + buf_block_dbg_add_level(block, SYNC_TREE_NODE); + + if (fil_page_index_page_check(block->frame) + && index_id == btr_page_get_index_id(block->frame)) { + /* This should be a root page. + It should not be possible to reassign the same + index_id for some other index in the tablespace. */ + ut_ad(!page_has_siblings(block->frame)); + } else { + block = NULL; + } + } + + return(block); +} + +/** Create the root node for a new index tree. 
+@param[in] type type of the index +@param[in] index_id index id +@param[in,out] space tablespace where created +@param[in] index index, or NULL to create a system table +@param[in,out] mtr mini-transaction +@return page number of the created root +@retval FIL_NULL if did not succeed */ +uint32_t +btr_create( + ulint type, + fil_space_t* space, + index_id_t index_id, + dict_index_t* index, + mtr_t* mtr) +{ + buf_block_t* block; + + ut_ad(mtr->is_named_space(space)); + ut_ad(index_id != BTR_FREED_INDEX_ID); + + /* Create the two new segments (one, in the case of an ibuf tree) for + the index tree; the segment headers are put on the allocated root page + (for an ibuf tree, not in the root, but on a separate ibuf header + page) */ + + if (UNIV_UNLIKELY(type & DICT_IBUF)) { + /* Allocate first the ibuf header page */ + buf_block_t* ibuf_hdr_block = fseg_create( + space, IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr); + + if (ibuf_hdr_block == NULL) { + return(FIL_NULL); + } + + buf_block_dbg_add_level( + ibuf_hdr_block, SYNC_IBUF_TREE_NODE_NEW); + + ut_ad(ibuf_hdr_block->page.id().page_no() + == IBUF_HEADER_PAGE_NO); + /* Allocate then the next page to the segment: it will be the + tree root page */ + + block = fseg_alloc_free_page( + buf_block_get_frame(ibuf_hdr_block) + + IBUF_HEADER + IBUF_TREE_SEG_HEADER, + IBUF_TREE_ROOT_PAGE_NO, + FSP_UP, mtr); + + if (block == NULL) { + return(FIL_NULL); + } + + ut_ad(block->page.id() == page_id_t(0,IBUF_TREE_ROOT_PAGE_NO)); + + buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW); + + flst_init(block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr); + } else { + block = fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_TOP, + mtr); + + if (block == NULL) { + return(FIL_NULL); + } + + buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW); + + if (!fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr, + false, block)) { + /* Not enough space for new segment, free root + segment before return. */ + btr_free_root(block, mtr); + return(FIL_NULL); + } + + /* The fseg create acquires a second latch on the page, + therefore we must declare it: */ + buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW); + } + + ut_ad(!page_has_siblings(block->frame)); + + constexpr uint16_t field = PAGE_HEADER + PAGE_INDEX_ID; + + byte* page_index_id = my_assume_aligned<2>(field + block->frame); + + /* Create a new index page on the allocated segment page */ + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + mach_write_to_8(page_index_id, index_id); + ut_ad(!page_has_siblings(block->page.zip.data)); + page_create_zip(block, index, 0, 0, mtr); + } else { + page_create(block, mtr, + index && index->table->not_redundant()); + if (index && index->is_spatial()) { + static_assert(((FIL_PAGE_INDEX & 0xff00) + | byte(FIL_PAGE_RTREE)) + == FIL_PAGE_RTREE, "compatibility"); + mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame, + byte(FIL_PAGE_RTREE)); + if (mach_read_from_8(block->frame + + FIL_RTREE_SPLIT_SEQ_NUM)) { + mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, + 8, 0); + } + } + /* Set the level of the new index page */ + mtr->write<2,mtr_t::MAYBE_NOP>(*block, PAGE_HEADER + PAGE_LEVEL + + block->frame, 0U); + mtr->write<8,mtr_t::MAYBE_NOP>(*block, page_index_id, + index_id); + } + + /* We reset the free bits for the page in a separate + mini-transaction to allow creation of several trees in the + same mtr, otherwise the latch on a bitmap page would prevent + it because of the latching order. 
+ + Note: Insert Buffering is disabled for temporary tables given that + most temporary tables are smaller in size and short-lived. */ + if (!(type & DICT_CLUSTERED) + && (!index || !index->table->is_temporary())) { + ibuf_reset_free_bits(block); + } + + /* In the following assertion we test that two records of maximum + allowed size fit on the root page: this fact is needed to ensure + correctness of split algorithms */ + + ut_ad(page_get_max_insert_size(block->frame, 2) + > 2 * BTR_PAGE_MAX_REC_SIZE); + + return(block->page.id().page_no()); +} + +/** Free a B-tree except the root page. The root page MUST be freed after +this by calling btr_free_root. +@param[in,out] block root page +@param[in] log_mode mtr logging mode */ +static +void +btr_free_but_not_root( + buf_block_t* block, + mtr_log_t log_mode) +{ + mtr_t mtr; + + ut_ad(fil_page_index_page_check(block->frame)); + ut_ad(!page_has_siblings(block->frame)); +leaf_loop: + mtr_start(&mtr); + mtr_set_log_mode(&mtr, log_mode); + mtr.set_named_space_id(block->page.id().space()); + + page_t* root = block->frame; + + if (!root) { + mtr_commit(&mtr); + return; + } + +#ifdef UNIV_BTR_DEBUG + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + root, block->page.id().space())); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, block->page.id().space())); +#endif /* UNIV_BTR_DEBUG */ + + /* NOTE: page hash indexes are dropped when a page is freed inside + fsp0fsp. */ + + bool finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF, + &mtr); + mtr_commit(&mtr); + + if (!finished) { + + goto leaf_loop; + } +top_loop: + mtr_start(&mtr); + mtr_set_log_mode(&mtr, log_mode); + mtr.set_named_space_id(block->page.id().space()); + + root = block->frame; + +#ifdef UNIV_BTR_DEBUG + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, block->page.id().space())); +#endif /* UNIV_BTR_DEBUG */ + + finished = fseg_free_step_not_header( + root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr); + mtr_commit(&mtr); + + if (!finished) { + goto top_loop; + } +} + +/** Free a persistent index tree if it exists. +@param[in] page_id root page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] index_id PAGE_INDEX_ID contents +@param[in,out] mtr mini-transaction */ +void +btr_free_if_exists( + const page_id_t page_id, + ulint zip_size, + index_id_t index_id, + mtr_t* mtr) +{ + buf_block_t* root = btr_free_root_check( + page_id, zip_size, index_id, mtr); + + if (root == NULL) { + return; + } + + btr_free_but_not_root(root, mtr->get_log_mode()); + mtr->set_named_space_id(page_id.space()); + btr_free_root(root, mtr); +} + +/** Free an index tree in a temporary tablespace. +@param[in] page_id root page id */ +void btr_free(const page_id_t page_id) +{ + mtr_t mtr; + mtr.start(); + mtr.set_log_mode(MTR_LOG_NO_REDO); + + buf_block_t* block = buf_page_get(page_id, 0, RW_X_LATCH, &mtr); + + if (block) { + btr_free_but_not_root(block, MTR_LOG_NO_REDO); + btr_free_root(block, &mtr); + } + mtr.commit(); +} + +/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC. 
+@param[in,out] index clustered index +@return the last used AUTO_INCREMENT value +@retval 0 on error or if no AUTO_INCREMENT value was used yet */ +ib_uint64_t +btr_read_autoinc(dict_index_t* index) +{ + ut_ad(index->is_primary()); + ut_ad(index->table->persistent_autoinc); + ut_ad(!index->table->is_temporary()); + mtr_t mtr; + mtr.start(); + ib_uint64_t autoinc; + if (buf_block_t* block = buf_page_get( + page_id_t(index->table->space_id, index->page), + index->table->space->zip_size(), + RW_S_LATCH, &mtr)) { + autoinc = page_get_autoinc(block->frame); + } else { + autoinc = 0; + } + mtr.commit(); + return autoinc; +} + +/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC, +or fall back to MAX(auto_increment_column). +@param[in] table table containing an AUTO_INCREMENT column +@param[in] col_no index of the AUTO_INCREMENT column +@return the AUTO_INCREMENT value +@retval 0 on error or if no AUTO_INCREMENT value was used yet */ +ib_uint64_t +btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no) +{ + ut_ad(table->persistent_autoinc); + ut_ad(!table->is_temporary()); + + dict_index_t* index = dict_table_get_first_index(table); + + if (index == NULL) { + return 0; + } + + mtr_t mtr; + mtr.start(); + buf_block_t* block = buf_page_get( + page_id_t(index->table->space_id, index->page), + index->table->space->zip_size(), + RW_S_LATCH, &mtr); + + ib_uint64_t autoinc = block ? page_get_autoinc(block->frame) : 0; + const bool retry = block && autoinc == 0 + && !page_is_empty(block->frame); + mtr.commit(); + + if (retry) { + /* This should be an old data file where + PAGE_ROOT_AUTO_INC was initialized to 0. + Fall back to reading MAX(autoinc_col). + There should be an index on it. */ + const dict_col_t* autoinc_col + = dict_table_get_nth_col(table, col_no); + while (index && index->fields[0].col != autoinc_col) { + index = dict_table_get_next_index(index); + } + + if (index) { + autoinc = row_search_max_autoinc(index); + } + } + + return autoinc; +} + +/** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC. +@param[in,out] index clustered index +@param[in] autoinc the AUTO_INCREMENT value +@param[in] reset whether to reset the AUTO_INCREMENT + to a possibly smaller value than currently + exists in the page */ +void +btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset) +{ + ut_ad(index->is_primary()); + ut_ad(index->table->persistent_autoinc); + ut_ad(!index->table->is_temporary()); + + mtr_t mtr; + mtr.start(); + fil_space_t* space = index->table->space; + mtr.set_named_space(space); + page_set_autoinc(buf_page_get(page_id_t(space->id, index->page), + space->zip_size(), + RW_SX_LATCH, &mtr), + autoinc, &mtr, reset); + mtr.commit(); +} + +/** Reorganize an index page. 
+@param cursor index page cursor +@param index the index that the cursor belongs to +@param mtr mini-transaction */ +static void btr_page_reorganize_low(page_cur_t *cursor, dict_index_t *index, + mtr_t *mtr) +{ + const mtr_log_t log_mode= mtr->set_log_mode(MTR_LOG_NO_REDO); + + buf_block_t *const block= cursor->block; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!is_buf_block_get_page_zip(block)); + btr_assert_not_corrupted(block, index); + ut_ad(fil_page_index_page_check(block->frame)); + ut_ad(index->is_dummy || + block->page.id().space() == index->table->space->id); + ut_ad(index->is_dummy || block->page.id().page_no() != index->page || + !page_has_siblings(block->frame)); + + buf_block_t *old= buf_block_alloc(); + /* Copy the old page to temporary space */ + memcpy_aligned<UNIV_PAGE_SIZE_MIN>(old->frame, block->frame, srv_page_size); + + btr_search_drop_page_hash_index(block); + + /* Save the cursor position. */ + const ulint pos= page_rec_get_n_recs_before(cursor->rec); + + page_create(block, mtr, index->table->not_redundant()); + if (index->is_spatial()) + block->frame[FIL_PAGE_TYPE + 1]= byte(FIL_PAGE_RTREE); + + static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) == + FIL_PAGE_RTREE, "compatibility"); + + /* Copy the records from the temporary space to the recreated page; + do not copy the lock bits yet */ + + page_copy_rec_list_end_no_locks(block, old, page_get_infimum_rec(old->frame), + index, mtr); + + /* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */ + ut_ad(!page_get_max_trx_id(block->frame)); + memcpy_aligned<8>(PAGE_MAX_TRX_ID + PAGE_HEADER + block->frame, + PAGE_MAX_TRX_ID + PAGE_HEADER + old->frame, 8); +#ifdef UNIV_DEBUG + if (page_get_max_trx_id(block->frame)) + /* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than + clustered index root pages. */ + ut_ad(dict_index_is_sec_or_ibuf(index) + ? page_is_leaf(block->frame) + : block->page.id().page_no() == index->page); + else + /* PAGE_MAX_TRX_ID is unused in clustered index pages (other than + the root where it is repurposed as PAGE_ROOT_AUTO_INC), non-leaf + pages, and in temporary tables. It was always zero-initialized in + page_create(). PAGE_MAX_TRX_ID must be nonzero on + dict_index_is_sec_or_ibuf() leaf pages. */ + ut_ad(index->table->is_temporary() || !page_is_leaf(block->frame) || + !dict_index_is_sec_or_ibuf(index)); +#endif + + const uint16_t data_size1= page_get_data_size(old->frame); + const uint16_t data_size2= page_get_data_size(block->frame); + const ulint max1= page_get_max_insert_size_after_reorganize(old->frame, 1); + const ulint max2= page_get_max_insert_size_after_reorganize(block->frame, 1); + + if (UNIV_UNLIKELY(data_size1 != data_size2 || max1 != max2)) + ib::fatal() << "Page old data size " << data_size1 + << " new data size " << data_size2 + << ", page old max ins size " << max1 + << " new max ins size " << max2; + + /* Restore the cursor position. */ + if (pos) + cursor->rec = page_rec_get_nth(block->frame, pos); + else + ut_ad(cursor->rec == page_get_infimum_rec(block->frame)); + + if (block->page.id().page_no() == index->page && + fil_page_get_type(old->frame) == FIL_PAGE_TYPE_INSTANT) + { + /* Preserve the PAGE_INSTANT information. 
*/ + ut_ad(index->is_instant()); + memcpy_aligned<2>(FIL_PAGE_TYPE + block->frame, + FIL_PAGE_TYPE + old->frame, 2); + memcpy_aligned<2>(PAGE_HEADER + PAGE_INSTANT + block->frame, + PAGE_HEADER + PAGE_INSTANT + old->frame, 2); + if (!index->table->instant); + else if (page_is_comp(block->frame)) + { + memcpy(PAGE_NEW_INFIMUM + block->frame, + PAGE_NEW_INFIMUM + old->frame, 8); + memcpy(PAGE_NEW_SUPREMUM + block->frame, + PAGE_NEW_SUPREMUM + old->frame, 8); + } + else + { + memcpy(PAGE_OLD_INFIMUM + block->frame, + PAGE_OLD_INFIMUM + old->frame, 8); + memcpy(PAGE_OLD_SUPREMUM + block->frame, + PAGE_OLD_SUPREMUM + old->frame, 8); + } + } + + ut_ad(!memcmp(old->frame, block->frame, PAGE_HEADER)); + ut_ad(!memcmp(old->frame + PAGE_MAX_TRX_ID + PAGE_HEADER, + block->frame + PAGE_MAX_TRX_ID + PAGE_HEADER, + PAGE_DATA - (PAGE_MAX_TRX_ID + PAGE_HEADER))); + + if (!dict_table_is_locking_disabled(index->table)) + lock_move_reorganize_page(block, old); + + /* Write log for the changes, if needed. */ + mtr->set_log_mode(log_mode); + if (log_mode == MTR_LOG_ALL) + { + /* Check and log the changes in the page header. */ + ulint a, e; + for (a= PAGE_HEADER, e= PAGE_MAX_TRX_ID + PAGE_HEADER; a < e; a++) + { + if (old->frame[a] == block->frame[a]) + continue; + while (--e, old->frame[e] == block->frame[e]); + e++; + ut_ad(a < e); + /* Write log for the changed page header fields. */ + mtr->memcpy(*block, a, e - a); + break; + } + + const uint16_t top= page_header_get_offs(block->frame, PAGE_HEAP_TOP); + + if (page_is_comp(block->frame)) + { + /* info_bits=0, n_owned=1, heap_no=0, status */ + ut_ad(!memcmp(PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES + block->frame, + PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES + old->frame, 3)); + /* If the 'next' pointer of the infimum record has changed, log it. */ + a= PAGE_NEW_INFIMUM - 2; + e= a + 2; + if (block->frame[a] == old->frame[a]) + a++; + if (--e, block->frame[e] != old->frame[e]) + e++; + if (ulint len= e - a) + mtr->memcpy(*block, a, len); + /* The infimum record itself must not change. */ + ut_ad(!memcmp(PAGE_NEW_INFIMUM + block->frame, + PAGE_NEW_INFIMUM + old->frame, 8)); + /* Log any change of the n_owned of the supremum record. */ + a= PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES; + if (block->frame[a] != old->frame[a]) + mtr->memcpy(*block, a, 1); + /* The rest of the supremum record must not change. */ + ut_ad(!memcmp(&block->frame[a + 1], &old->frame[a + 1], + PAGE_NEW_SUPREMUM_END - PAGE_NEW_SUPREMUM + + REC_N_NEW_EXTRA_BYTES - 1)); + + /* Log the differences in the payload. */ + for (a= PAGE_NEW_SUPREMUM_END, e= top; a < e; a++) + { + if (old->frame[a] == block->frame[a]) + continue; + while (--e, old->frame[e] == block->frame[e]); + e++; + ut_ad(a < e); + /* TODO: write MEMMOVE records to minimize this further! */ + mtr->memcpy(*block, a, e - a); + break; + } + } + else + { + /* info_bits=0, n_owned=1, heap_no=0, number of fields, 1-byte format */ + ut_ad(!memcmp(PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES + block->frame, + PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES + old->frame, 4)); + /* If the 'next' pointer of the infimum record has changed, log it. */ + a= PAGE_OLD_INFIMUM - 2; + e= a + 2; + if (block->frame[a] == old->frame[a]) + a++; + if (--e, block->frame[e] != old->frame[e]) + e++; + if (ulint len= e - a) + mtr->memcpy(*block, a, len); + /* The infimum record itself must not change. */ + ut_ad(!memcmp(PAGE_OLD_INFIMUM + block->frame, + PAGE_OLD_INFIMUM + old->frame, 8)); + /* Log any change of the n_owned of the supremum record. 
*/ + a= PAGE_OLD_SUPREMUM - REC_N_OLD_EXTRA_BYTES; + if (block->frame[a] != old->frame[a]) + mtr->memcpy(*block, a, 1); + ut_ad(!memcmp(&block->frame[a + 1], &old->frame[a + 1], + PAGE_OLD_SUPREMUM_END - PAGE_OLD_SUPREMUM + + REC_N_OLD_EXTRA_BYTES - 1)); + + /* Log the differences in the payload. */ + for (a= PAGE_OLD_SUPREMUM_END, e= top; a < e; a++) + { + if (old->frame[a] == block->frame[a]) + continue; + while (--e, old->frame[e] == block->frame[e]); + e++; + ut_ad(a < e); + /* TODO: write MEMMOVE records to minimize this further! */ + mtr->memcpy(*block, a, e - a); + break; + } + } + + e= srv_page_size - PAGE_DIR; + a= e - PAGE_DIR_SLOT_SIZE * page_dir_get_n_slots(block->frame); + + /* Zero out the payload area. */ + mtr->memset(*block, top, a - top, 0); + + /* Log changes to the page directory. */ + for (; a < e; a++) + { + if (old->frame[a] == block->frame[a]) + continue; + while (--e, old->frame[e] == block->frame[e]); + e++; + ut_ad(a < e); + /* Write log for the changed page directory slots. */ + mtr->memcpy(*block, a, e - a); + break; + } + } + + buf_block_free(old); + + MONITOR_INC(MONITOR_INDEX_REORG_ATTEMPTS); + MONITOR_INC(MONITOR_INDEX_REORG_SUCCESSFUL); +} + +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. + +@retval true if the operation was successful +@retval false if it is a compressed page, and recompression failed */ +bool +btr_page_reorganize_block( + ulint z_level,/*!< in: compression level to be used + if dealing with compressed page */ + buf_block_t* block, /*!< in/out: B-tree page */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + if (buf_block_get_page_zip(block)) { + return page_zip_reorganize(block, index, z_level, mtr, true); + } + + page_cur_t cur; + page_cur_set_before_first(block, &cur); + + btr_page_reorganize_low(&cur, index, mtr); + return true; +} + +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. + +@retval true if the operation was successful +@retval false if it is a compressed page, and recompression failed */ +bool +btr_page_reorganize( +/*================*/ + page_cur_t* cursor, /*!< in/out: page cursor */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + if (!buf_block_get_page_zip(cursor->block)) { + btr_page_reorganize_low(cursor, index, mtr); + return true; + } + + ulint pos = page_rec_get_n_recs_before(cursor->rec); + if (!page_zip_reorganize(cursor->block, index, page_zip_level, mtr, + true)) { + return false; + } + if (pos) { + cursor->rec = page_rec_get_nth(cursor->block->frame, pos); + } else { + ut_ad(cursor->rec == page_get_infimum_rec( + cursor->block->frame)); + } + + return true; +} + +/** Empty an index page (possibly the root page). @see btr_page_create(). 
+@param[in,out] block page to be emptied +@param[in,out] page_zip compressed page frame, or NULL +@param[in] index index of the page +@param[in] level B-tree level of the page (0=leaf) +@param[in,out] mtr mini-transaction */ +void +btr_page_empty( + buf_block_t* block, + page_zip_des_t* page_zip, + dict_index_t* index, + ulint level, + mtr_t* mtr) +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_zip == buf_block_get_page_zip(block)); + ut_ad(!index->is_dummy); + ut_ad(index->table->space->id == block->page.id().space()); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + + btr_search_drop_page_hash_index(block); + + /* Recreate the page: note that global data on page (possible + segment headers, next page-field, etc.) is preserved intact */ + + /* Preserve PAGE_ROOT_AUTO_INC when creating a clustered index + root page. */ + const ib_uint64_t autoinc + = dict_index_is_clust(index) + && index->page == block->page.id().page_no() + ? page_get_autoinc(block->frame) + : 0; + + if (page_zip) { + page_create_zip(block, index, level, autoinc, mtr); + } else { + page_create(block, mtr, index->table->not_redundant()); + if (index->is_spatial()) { + static_assert(((FIL_PAGE_INDEX & 0xff00) + | byte(FIL_PAGE_RTREE)) + == FIL_PAGE_RTREE, "compatibility"); + mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame, + byte(FIL_PAGE_RTREE)); + if (mach_read_from_8(block->frame + + FIL_RTREE_SPLIT_SEQ_NUM)) { + mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, + 8, 0); + } + } + mtr->write<2,mtr_t::MAYBE_NOP>(*block, PAGE_HEADER + PAGE_LEVEL + + block->frame, level); + if (autoinc) { + mtr->write<8>(*block, PAGE_HEADER + PAGE_MAX_TRX_ID + + block->frame, autoinc); + } + } +} + +/** Write instant ALTER TABLE metadata to a root page. +@param[in,out] root clustered index root page +@param[in] index clustered index with instant ALTER TABLE +@param[in,out] mtr mini-transaction */ +void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr) +{ + ut_ad(index.n_core_fields > 0); + ut_ad(index.n_core_fields < REC_MAX_N_FIELDS); + ut_ad(index.is_instant()); + ut_ad(fil_page_get_type(root->frame) == FIL_PAGE_TYPE_INSTANT + || fil_page_get_type(root->frame) == FIL_PAGE_INDEX); + ut_ad(!page_has_siblings(root->frame)); + ut_ad(root->page.id().page_no() == index.page); + + rec_t* infimum = page_get_infimum_rec(root->frame); + rec_t* supremum = page_get_supremum_rec(root->frame); + byte* page_type = root->frame + FIL_PAGE_TYPE; + uint16_t i = page_header_get_field(root->frame, PAGE_INSTANT); + + switch (mach_read_from_2(page_type)) { + case FIL_PAGE_TYPE_INSTANT: + ut_ad(page_get_instant(root->frame) == index.n_core_fields); + if (memcmp(infimum, "infimum", 8) + || memcmp(supremum, "supremum", 8)) { + ut_ad(index.table->instant); + ut_ad(!memcmp(infimum, field_ref_zero, 8)); + ut_ad(!memcmp(supremum, field_ref_zero, 7)); + /* The n_core_null_bytes only matters for + ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables. 
*/ + ut_ad(supremum[7] == index.n_core_null_bytes + || !index.table->not_redundant()); + return; + } + break; + default: + ut_ad("wrong page type" == 0); + /* fall through */ + case FIL_PAGE_INDEX: + ut_ad(!page_is_comp(root->frame) + || !page_get_instant(root->frame)); + ut_ad(!memcmp(infimum, "infimum", 8)); + ut_ad(!memcmp(supremum, "supremum", 8)); + mtr->write<2>(*root, page_type, FIL_PAGE_TYPE_INSTANT); + ut_ad(i <= PAGE_NO_DIRECTION); + i |= static_cast<uint16_t>(index.n_core_fields << 3); + mtr->write<2>(*root, PAGE_HEADER + PAGE_INSTANT + root->frame, + i); + break; + } + + if (index.table->instant) { + mtr->memset(root, infimum - root->frame, 8, 0); + mtr->memset(root, supremum - root->frame, 7, 0); + mtr->write<1,mtr_t::MAYBE_NOP>(*root, &supremum[7], + index.n_core_null_bytes); + } +} + +/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE. +@param[in] index clustered index with instant ALTER TABLE +@param[in] all whether to reset FIL_PAGE_TYPE as well +@param[in,out] mtr mini-transaction */ +ATTRIBUTE_COLD +void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr) +{ + ut_ad(!index.table->is_temporary()); + ut_ad(index.is_primary()); + if (buf_block_t *root = btr_root_block_get(&index, RW_SX_LATCH, mtr)) + { + byte *page_type= root->frame + FIL_PAGE_TYPE; + if (all) + { + ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT || + mach_read_from_2(page_type) == FIL_PAGE_INDEX); + mtr->write<2,mtr_t::MAYBE_NOP>(*root, page_type, FIL_PAGE_INDEX); + byte *instant= PAGE_INSTANT + PAGE_HEADER + root->frame; + mtr->write<2,mtr_t::MAYBE_NOP>(*root, instant, + page_ptr_get_direction(instant + 1)); + } + else + ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT); + static const byte supremuminfimum[8 + 8] = "supremuminfimum"; + uint16_t infimum, supremum; + if (page_is_comp(root->frame)) + { + infimum= PAGE_NEW_INFIMUM; + supremum= PAGE_NEW_SUPREMUM; + } + else + { + infimum= PAGE_OLD_INFIMUM; + supremum= PAGE_OLD_SUPREMUM; + } + ut_ad(!memcmp(&root->frame[infimum], supremuminfimum + 8, 8) == + !memcmp(&root->frame[supremum], supremuminfimum, 8)); + mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->frame[infimum], + supremuminfimum + 8, 8); + mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->frame[supremum], + supremuminfimum, 8); + } +} + +/*************************************************************//** +Makes tree one level higher by splitting the root, and inserts +the tuple. It is assumed that mtr contains an x-latch on the tree. +NOTE that the operation of this function must always succeed, +we cannot reverse it: therefore enough free disk space must be +guaranteed to be available before this function is called. 
+@return inserted record */ +rec_t* +btr_root_raise_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert: must be + on the root page; when the function returns, + the cursor is positioned on the predecessor + of the inserted record */ + rec_offs** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + ulint new_page_no; + rec_t* rec; + dtuple_t* node_ptr; + ulint level; + rec_t* node_ptr_rec; + page_cur_t* page_cursor; + page_zip_des_t* root_page_zip; + page_zip_des_t* new_page_zip; + buf_block_t* root; + buf_block_t* new_block; + + root = btr_cur_get_block(cursor); + root_page_zip = buf_block_get_page_zip(root); + ut_ad(!page_is_empty(root->frame)); + index = btr_cur_get_index(cursor); + ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!root_page_zip || page_zip_validate(root_page_zip, root->frame, + index)); +#endif /* UNIV_ZIP_DEBUG */ +#ifdef UNIV_BTR_DEBUG + if (!dict_index_is_ibuf(index)) { + ulint space = index->table->space_id; + + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + root->frame, space)); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root->frame, space)); + } + + ut_a(dict_index_get_page(index) == root->page.id().page_no()); +#endif /* UNIV_BTR_DEBUG */ + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(root, MTR_MEMO_PAGE_X_FIX)); + + /* Allocate a new page to the tree. Root splitting is done by first + moving the root records to the new page, emptying the root, putting + a node pointer to the new page, and then splitting the new page. */ + + level = btr_page_get_level(root->frame); + + new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr); + + if (new_block == NULL && os_has_said_disk_full) { + return(NULL); + } + + new_page_zip = buf_block_get_page_zip(new_block); + ut_a(!new_page_zip == !root_page_zip); + ut_a(!new_page_zip + || page_zip_get_size(new_page_zip) + == page_zip_get_size(root_page_zip)); + + btr_page_create(new_block, new_page_zip, index, level, mtr); + if (page_has_siblings(new_block->frame)) { + compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4); + compile_time_assert(FIL_NULL == 0xffffffff); + static_assert(FIL_PAGE_PREV % 8 == 0, "alignment"); + memset_aligned<8>(new_block->frame + FIL_PAGE_PREV, 0xff, 8); + mtr->memset(new_block, FIL_PAGE_PREV, 8, 0xff); + if (UNIV_LIKELY_NULL(new_page_zip)) { + memset_aligned<8>(new_page_zip->data + FIL_PAGE_PREV, + 0xff, 8); + } + } + + /* Copy the records from root to the new page one by one. */ + + if (0 +#ifdef UNIV_ZIP_COPY + || new_page_zip +#endif /* UNIV_ZIP_COPY */ + || !page_copy_rec_list_end(new_block, root, + page_get_infimum_rec(root->frame), + index, mtr)) { + ut_a(new_page_zip); + + /* Copy the page byte for byte. */ + page_zip_copy_recs(new_block, + root_page_zip, root->frame, index, mtr); + + /* Update the lock table and possible hash index. 
*/ + lock_move_rec_list_end(new_block, root, + page_get_infimum_rec(root->frame)); + + /* Move any existing predicate locks */ + if (dict_index_is_spatial(index)) { + lock_prdt_rec_move(new_block, root); + } else { + btr_search_move_or_delete_hash_entries( + new_block, root); + } + } + + constexpr uint16_t max_trx_id = PAGE_HEADER + PAGE_MAX_TRX_ID; + if (dict_index_is_sec_or_ibuf(index)) { + /* In secondary indexes and the change buffer, + PAGE_MAX_TRX_ID can be reset on the root page, because + the field only matters on leaf pages, and the root no + longer is a leaf page. (Older versions of InnoDB did + set PAGE_MAX_TRX_ID on all secondary index pages.) */ + byte* p = my_assume_aligned<8>( + PAGE_HEADER + PAGE_MAX_TRX_ID + root->frame); + if (mach_read_from_8(p)) { + mtr->memset(root, max_trx_id, 8, 0); + if (UNIV_LIKELY_NULL(root->page.zip.data)) { + memset_aligned<8>(max_trx_id + + root->page.zip.data, 0, 8); + } + } + } else { + /* PAGE_ROOT_AUTO_INC is only present in the clustered index + root page; on other clustered index pages, we want to reserve + the field PAGE_MAX_TRX_ID for future use. */ + byte* p = my_assume_aligned<8>( + PAGE_HEADER + PAGE_MAX_TRX_ID + new_block->frame); + if (mach_read_from_8(p)) { + mtr->memset(new_block, max_trx_id, 8, 0); + if (UNIV_LIKELY_NULL(new_block->page.zip.data)) { + memset_aligned<8>(max_trx_id + + new_block->page.zip.data, + 0, 8); + } + } + } + + /* If this is a pessimistic insert which is actually done to + perform a pessimistic update then we have stored the lock + information of the record to be inserted on the infimum of the + root page: we cannot discard the lock structs on the root page */ + + if (!dict_table_is_locking_disabled(index->table)) { + lock_update_root_raise(new_block, root); + } + + /* Create a memory heap where the node pointer is stored */ + if (!*heap) { + *heap = mem_heap_create(1000); + } + + rec = page_rec_get_next(page_get_infimum_rec(new_block->frame)); + new_page_no = new_block->page.id().page_no(); + + /* Build the node pointer (= node key and page address) for the + child */ + if (dict_index_is_spatial(index)) { + rtr_mbr_t new_mbr; + + rtr_page_cal_mbr(index, new_block, &new_mbr, *heap); + node_ptr = rtr_index_build_node_ptr( + index, &new_mbr, rec, new_page_no, *heap); + } else { + node_ptr = dict_index_build_node_ptr( + index, rec, new_page_no, *heap, level); + } + /* The node pointer must be marked as the predefined minimum record, + as there is no lower alphabetical limit to records in the leftmost + node of a level: */ + dtuple_set_info_bits(node_ptr, + dtuple_get_info_bits(node_ptr) + | REC_INFO_MIN_REC_FLAG); + + /* Rebuild the root page to get free space */ + btr_page_empty(root, root_page_zip, index, level + 1, mtr); + /* btr_page_empty() is supposed to zero-initialize the field. */ + ut_ad(!page_get_instant(root->frame)); + + if (index->is_instant()) { + ut_ad(!root_page_zip); + btr_set_instant(root, *index, mtr); + } + + ut_ad(!page_has_siblings(root->frame)); + + page_cursor = btr_cur_get_page_cur(cursor); + + /* Insert node pointer to the root */ + + page_cur_set_before_first(root, page_cursor); + + node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr, + index, offsets, heap, 0, mtr); + + /* The root page should only contain the node pointer + to new_block at this point. Thus, the data should fit. 
*/ + ut_a(node_ptr_rec); + + /* We play safe and reset the free bits for the new page */ + + if (!dict_index_is_clust(index) + && !index->table->is_temporary()) { + ibuf_reset_free_bits(new_block); + } + + if (tuple != NULL) { + /* Reposition the cursor to the child node */ + page_cur_search(new_block, index, tuple, page_cursor); + } else { + /* Set cursor to first record on child node */ + page_cur_set_before_first(new_block, page_cursor); + } + + /* Split the child and insert tuple */ + if (dict_index_is_spatial(index)) { + /* Split rtree page and insert tuple */ + return(rtr_page_split_and_insert(flags, cursor, offsets, heap, + tuple, n_ext, mtr)); + } else { + return(btr_page_split_and_insert(flags, cursor, offsets, heap, + tuple, n_ext, mtr)); + } +} + +/** Decide if the page should be split at the convergence point of inserts +converging to the left. +@param[in] cursor insert position +@return the first record to be moved to the right half page +@retval NULL if no split is recommended */ +rec_t* btr_page_get_split_rec_to_left(const btr_cur_t* cursor) +{ + rec_t* split_rec = btr_cur_get_rec(cursor); + const page_t* page = page_align(split_rec); + + if (page_header_get_ptr(page, PAGE_LAST_INSERT) + != page_rec_get_next(split_rec)) { + return NULL; + } + + /* The metadata record must be present in the leftmost leaf page + of the clustered index, if and only if index->is_instant(). + However, during innobase_instant_try(), index->is_instant() + would already hold when row_ins_clust_index_entry_low() + is being invoked to insert the the metadata record. + So, we can only assert that when the metadata record exists, + index->is_instant() must hold. */ + ut_ad(!page_is_leaf(page) || page_has_prev(page) + || cursor->index->is_instant() + || !(rec_get_info_bits(page_rec_get_next_const( + page_get_infimum_rec(page)), + cursor->index->table->not_redundant()) + & REC_INFO_MIN_REC_FLAG)); + + const rec_t* infimum = page_get_infimum_rec(page); + + /* If the convergence is in the middle of a page, include also + the record immediately before the new insert to the upper + page. Otherwise, we could repeatedly move from page to page + lots of records smaller than the convergence point. */ + + if (split_rec == infimum + || split_rec == page_rec_get_next_const(infimum)) { + split_rec = page_rec_get_next(split_rec); + } + + return split_rec; +} + +/** Decide if the page should be split at the convergence point of inserts +converging to the right. +@param[in] cursor insert position +@param[out] split_rec if split recommended, the first record + on the right half page, or + NULL if the to-be-inserted record + should be first +@return whether split is recommended */ +bool +btr_page_get_split_rec_to_right(const btr_cur_t* cursor, rec_t** split_rec) +{ + rec_t* insert_point = btr_cur_get_rec(cursor); + const page_t* page = page_align(insert_point); + + /* We use eager heuristics: if the new insert would be right after + the previous insert on the same page, we assume that there is a + pattern of sequential inserts here. */ + + if (page_header_get_ptr(page, PAGE_LAST_INSERT) != insert_point) { + return false; + } + + insert_point = page_rec_get_next(insert_point); + + if (page_rec_is_supremum(insert_point)) { + insert_point = NULL; + } else { + insert_point = page_rec_get_next(insert_point); + if (page_rec_is_supremum(insert_point)) { + insert_point = NULL; + } + + /* If there are >= 2 user records up from the insert + point, split all but 1 off. 
We want to keep one because + then sequential inserts can use the adaptive hash + index, as they can do the necessary checks of the right + search position just by looking at the records on this + page. */ + } + + *split_rec = insert_point; + return true; +} + +/*************************************************************//** +Calculates a split record such that the tuple will certainly fit on +its half-page when the split is performed. We assume in this function +only that the cursor page has at least one user record. +@return split record, or NULL if tuple will be the first record on +the lower or upper half-page (determined by btr_page_tuple_smaller()) */ +static +rec_t* +btr_page_get_split_rec( +/*===================*/ + btr_cur_t* cursor, /*!< in: cursor at which insert should be made */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + page_t* page; + page_zip_des_t* page_zip; + ulint insert_size; + ulint free_space; + ulint total_data; + ulint total_n_recs; + ulint total_space; + ulint incl_data; + rec_t* ins_rec; + rec_t* rec; + rec_t* next_rec; + ulint n; + mem_heap_t* heap; + rec_offs* offsets; + + page = btr_cur_get_page(cursor); + + insert_size = rec_get_converted_size(cursor->index, tuple, n_ext); + free_space = page_get_free_space_of_empty(page_is_comp(page)); + + page_zip = btr_cur_get_page_zip(cursor); + if (page_zip) { + /* Estimate the free space of an empty compressed page. */ + ulint free_space_zip = page_zip_empty_size( + cursor->index->n_fields, + page_zip_get_size(page_zip)); + + if (free_space > (ulint) free_space_zip) { + free_space = (ulint) free_space_zip; + } + } + + /* free_space is now the free space of a created new page */ + + total_data = page_get_data_size(page) + insert_size; + total_n_recs = ulint(page_get_n_recs(page)) + 1; + ut_ad(total_n_recs >= 2); + total_space = total_data + page_dir_calc_reserved_space(total_n_recs); + + n = 0; + incl_data = 0; + ins_rec = btr_cur_get_rec(cursor); + rec = page_get_infimum_rec(page); + + heap = NULL; + offsets = NULL; + + /* We start to include records to the left half, and when the + space reserved by them exceeds half of total_space, then if + the included records fit on the left page, they will be put there + if something was left over also for the right page, + otherwise the last included record will be the first on the right + half page */ + + do { + /* Decide the next record to include */ + if (rec == ins_rec) { + rec = NULL; /* NULL denotes that tuple is + now included */ + } else if (rec == NULL) { + rec = page_rec_get_next(ins_rec); + } else { + rec = page_rec_get_next(rec); + } + + if (rec == NULL) { + /* Include tuple */ + incl_data += insert_size; + } else { + offsets = rec_get_offsets(rec, cursor->index, offsets, + page_is_leaf(page) + ? 
cursor->index->n_core_fields + : 0, + ULINT_UNDEFINED, &heap); + incl_data += rec_offs_size(offsets); + } + + n++; + } while (incl_data + page_dir_calc_reserved_space(n) + < total_space / 2); + + if (incl_data + page_dir_calc_reserved_space(n) <= free_space) { + /* The next record will be the first on + the right half page if it is not the + supremum record of page */ + + if (rec == ins_rec) { + rec = NULL; + + goto func_exit; + } else if (rec == NULL) { + next_rec = page_rec_get_next(ins_rec); + } else { + next_rec = page_rec_get_next(rec); + } + ut_ad(next_rec); + if (!page_rec_is_supremum(next_rec)) { + rec = next_rec; + } + } + +func_exit: + if (heap) { + mem_heap_free(heap); + } + return(rec); +} + +/*************************************************************//** +Returns TRUE if the insert fits on the appropriate half-page with the +chosen split_rec. +@return true if fits */ +static MY_ATTRIBUTE((nonnull(1,3,4,6), warn_unused_result)) +bool +btr_page_insert_fits( +/*=================*/ + btr_cur_t* cursor, /*!< in: cursor at which insert + should be made */ + const rec_t* split_rec,/*!< in: suggestion for first record + on upper half-page, or NULL if + tuple to be inserted should be first */ + rec_offs** offsets,/*!< in: rec_get_offsets( + split_rec, cursor->index); out: garbage */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mem_heap_t** heap) /*!< in: temporary memory heap */ +{ + page_t* page; + ulint insert_size; + ulint free_space; + ulint total_data; + ulint total_n_recs; + const rec_t* rec; + const rec_t* end_rec; + + page = btr_cur_get_page(cursor); + + ut_ad(!split_rec + || !page_is_comp(page) == !rec_offs_comp(*offsets)); + ut_ad(!split_rec + || rec_offs_validate(split_rec, cursor->index, *offsets)); + + insert_size = rec_get_converted_size(cursor->index, tuple, n_ext); + free_space = page_get_free_space_of_empty(page_is_comp(page)); + + /* free_space is now the free space of a created new page */ + + total_data = page_get_data_size(page) + insert_size; + total_n_recs = ulint(page_get_n_recs(page)) + 1; + + /* We determine which records (from rec to end_rec, not including + end_rec) will end up on the other half page from tuple when it is + inserted. */ + + if (split_rec == NULL) { + rec = page_rec_get_next(page_get_infimum_rec(page)); + end_rec = page_rec_get_next(btr_cur_get_rec(cursor)); + + } else if (cmp_dtuple_rec(tuple, split_rec, *offsets) >= 0) { + + rec = page_rec_get_next(page_get_infimum_rec(page)); + end_rec = split_rec; + } else { + rec = split_rec; + end_rec = page_get_supremum_rec(page); + } + + if (total_data + page_dir_calc_reserved_space(total_n_recs) + <= free_space) { + + /* Ok, there will be enough available space on the + half page where the tuple is inserted */ + + return(true); + } + + while (rec != end_rec) { + /* In this loop we calculate the amount of reserved + space after rec is removed from page. */ + + *offsets = rec_get_offsets(rec, cursor->index, *offsets, + page_is_leaf(page) + ? 
cursor->index->n_core_fields + : 0, + ULINT_UNDEFINED, heap); + + total_data -= rec_offs_size(*offsets); + total_n_recs--; + + if (total_data + page_dir_calc_reserved_space(total_n_recs) + <= free_space) { + + /* Ok, there will be enough available space on the + half page where the tuple is inserted */ + + return(true); + } + + rec = page_rec_get_next_const(rec); + } + + return(false); +} + +/*******************************************************//** +Inserts a data tuple to a tree on a non-leaf level. It is assumed +that mtr holds an x-latch on the tree. */ +void +btr_insert_on_non_leaf_level_func( +/*==============================*/ + ulint flags, /*!< in: undo logging and locking flags */ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: level, must be > 0 */ + dtuple_t* tuple, /*!< in: the record to be inserted */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + big_rec_t* dummy_big_rec; + btr_cur_t cursor; + dberr_t err; + rec_t* rec; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + rtr_info_t rtr_info; + + ut_ad(level > 0); + + if (!dict_index_is_spatial(index)) { + dberr_t err = btr_cur_search_to_nth_level( + index, level, tuple, PAGE_CUR_LE, + BTR_CONT_MODIFY_TREE, + &cursor, 0, file, line, mtr); + + if (err != DB_SUCCESS) { + ib::warn() << " Error code: " << err + << " btr_page_get_father_node_ptr_func " + << " level: " << level + << " called from file: " + << file << " line: " << line + << " table: " << index->table->name + << " index: " << index->name; + } + } else { + /* For spatial index, initialize structures to track + its parents etc. */ + rtr_init_rtr_info(&rtr_info, false, &cursor, index, false); + + rtr_info_update_btr(&cursor, &rtr_info); + + btr_cur_search_to_nth_level(index, level, tuple, + PAGE_CUR_RTREE_INSERT, + BTR_CONT_MODIFY_TREE, + &cursor, 0, file, line, mtr); + } + + ut_ad(cursor.flag == BTR_CUR_BINARY); + + err = btr_cur_optimistic_insert( + flags + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG + | BTR_NO_UNDO_LOG_FLAG, + &cursor, &offsets, &heap, + tuple, &rec, &dummy_big_rec, 0, NULL, mtr); + + if (err == DB_FAIL) { + err = btr_cur_pessimistic_insert(flags + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG + | BTR_NO_UNDO_LOG_FLAG, + &cursor, &offsets, &heap, + tuple, &rec, + &dummy_big_rec, 0, NULL, mtr); + ut_a(err == DB_SUCCESS); + } + + if (heap != NULL) { + mem_heap_free(heap); + } + + if (dict_index_is_spatial(index)) { + ut_ad(cursor.rtr_info); + + rtr_clean_rtr_info(&rtr_info, true); + } +} + +/**************************************************************//** +Attaches the halves of an index page on the appropriate level in an +index tree. 
*/ +static MY_ATTRIBUTE((nonnull)) +void +btr_attach_half_pages( +/*==================*/ + ulint flags, /*!< in: undo logging and + locking flags */ + dict_index_t* index, /*!< in: the index tree */ + buf_block_t* block, /*!< in/out: page to be split */ + const rec_t* split_rec, /*!< in: first record on upper + half page */ + buf_block_t* new_block, /*!< in/out: the new half page */ + ulint direction, /*!< in: FSP_UP or FSP_DOWN */ + mtr_t* mtr) /*!< in: mtr */ +{ + dtuple_t* node_ptr_upper; + mem_heap_t* heap; + buf_block_t* prev_block = NULL; + buf_block_t* next_block = NULL; + buf_block_t* lower_block; + buf_block_t* upper_block; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr->memo_contains_flagged(new_block, MTR_MEMO_PAGE_X_FIX)); + + /* Create a memory heap where the data tuple is stored */ + heap = mem_heap_create(1024); + + /* Based on split direction, decide upper and lower pages */ + if (direction == FSP_DOWN) { + + btr_cur_t cursor; + rec_offs* offsets; + + lower_block = new_block; + upper_block = block; + + /* Look up the index for the node pointer to page */ + offsets = btr_page_get_father_block(NULL, heap, index, + block, mtr, &cursor); + + /* Replace the address of the old child node (= page) with the + address of the new lower half */ + + btr_node_ptr_set_child_page_no( + btr_cur_get_block(&cursor), + btr_cur_get_rec(&cursor), + offsets, lower_block->page.id().page_no(), mtr); + mem_heap_empty(heap); + } else { + lower_block = block; + upper_block = new_block; + } + + /* Get the level of the split pages */ + const ulint level = btr_page_get_level(buf_block_get_frame(block)); + ut_ad(level == btr_page_get_level(buf_block_get_frame(new_block))); + + /* Get the previous and next pages of page */ + const uint32_t prev_page_no = btr_page_get_prev(block->frame); + const uint32_t next_page_no = btr_page_get_next(block->frame); + + /* for consistency, both blocks should be locked, before change */ + if (prev_page_no != FIL_NULL && direction == FSP_DOWN) { + prev_block = btr_block_get(*index, prev_page_no, RW_X_LATCH, + !level, mtr); + } + if (next_page_no != FIL_NULL && direction != FSP_DOWN) { + next_block = btr_block_get(*index, next_page_no, RW_X_LATCH, + !level, mtr); + } + + /* Build the node pointer (= node key and page address) for the upper + half */ + + node_ptr_upper = dict_index_build_node_ptr( + index, split_rec, upper_block->page.id().page_no(), + heap, level); + + /* Insert it next to the pointer to the lower half. Note that this + may generate recursion leading to a split on the higher level. 
*/ + + btr_insert_on_non_leaf_level(flags, index, level + 1, + node_ptr_upper, mtr); + + /* Free the memory heap */ + mem_heap_free(heap); + + /* Update page links of the level */ + + if (prev_block) { +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(prev_block->frame) + == page_is_comp(block->frame)); + ut_a(btr_page_get_next(prev_block->frame) + == block->page.id().page_no()); +#endif /* UNIV_BTR_DEBUG */ + btr_page_set_next(prev_block, lower_block->page.id().page_no(), + mtr); + } + + if (next_block) { +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(next_block->frame) + == page_is_comp(block->frame)); + ut_a(btr_page_get_prev(next_block->frame) + == block->page.id().page_no()); +#endif /* UNIV_BTR_DEBUG */ + btr_page_set_prev(next_block, upper_block->page.id().page_no(), + mtr); + } + + if (direction == FSP_DOWN) { + ut_ad(lower_block == new_block); + ut_ad(btr_page_get_next(upper_block->frame) == next_page_no); + btr_page_set_prev(lower_block, prev_page_no, mtr); + } else { + ut_ad(upper_block == new_block); + ut_ad(btr_page_get_prev(lower_block->frame) == prev_page_no); + btr_page_set_next(upper_block, next_page_no, mtr); + } + + btr_page_set_prev(upper_block, lower_block->page.id().page_no(), mtr); + btr_page_set_next(lower_block, upper_block->page.id().page_no(), mtr); +} + +/*************************************************************//** +Determine if a tuple is smaller than any record on the page. +@return TRUE if smaller */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +btr_page_tuple_smaller( +/*===================*/ + btr_cur_t* cursor, /*!< in: b-tree cursor */ + const dtuple_t* tuple, /*!< in: tuple to consider */ + rec_offs** offsets,/*!< in/out: temporary storage */ + ulint n_uniq, /*!< in: number of unique fields + in the index page records */ + mem_heap_t** heap) /*!< in/out: heap for offsets */ +{ + buf_block_t* block; + const rec_t* first_rec; + page_cur_t pcur; + + /* Read the first user record in the page. */ + block = btr_cur_get_block(cursor); + page_cur_set_before_first(block, &pcur); + page_cur_move_to_next(&pcur); + first_rec = page_cur_get_rec(&pcur); + + *offsets = rec_get_offsets( + first_rec, cursor->index, *offsets, + page_is_leaf(block->frame) ? cursor->index->n_core_fields : 0, + n_uniq, heap); + + return(cmp_dtuple_rec(tuple, first_rec, *offsets) < 0); +} + +/** Insert the tuple into the right sibling page, if the cursor is at the end +of a page. +@param[in] flags undo logging and locking flags +@param[in,out] cursor cursor at which to insert; when the function succeeds, + the cursor is positioned before the insert point. 
+@param[out] offsets offsets on inserted record +@param[in,out] heap memory heap for allocating offsets +@param[in] tuple tuple to insert +@param[in] n_ext number of externally stored columns +@param[in,out] mtr mini-transaction +@return inserted record (first record on the right sibling page); + the cursor will be positioned on the page infimum +@retval NULL if the operation was not performed */ +static +rec_t* +btr_insert_into_right_sibling( + ulint flags, + btr_cur_t* cursor, + rec_offs** offsets, + mem_heap_t* heap, + const dtuple_t* tuple, + ulint n_ext, + mtr_t* mtr) +{ + buf_block_t* block = btr_cur_get_block(cursor); + page_t* page = buf_block_get_frame(block); + const uint32_t next_page_no = btr_page_get_next(page); + + ut_ad(mtr->memo_contains_flagged(&cursor->index->lock, + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(heap); + + if (next_page_no == FIL_NULL || !page_rec_is_supremum( + page_rec_get_next(btr_cur_get_rec(cursor)))) { + + return(NULL); + } + + page_cur_t next_page_cursor; + buf_block_t* next_block; + page_t* next_page; + btr_cur_t next_father_cursor; + rec_t* rec = NULL; + ulint max_size; + + next_block = btr_block_get(*cursor->index, next_page_no, RW_X_LATCH, + page_is_leaf(page), mtr); + if (UNIV_UNLIKELY(!next_block)) { + return NULL; + } + next_page = buf_block_get_frame(next_block); + + bool is_leaf = page_is_leaf(next_page); + + btr_page_get_father( + cursor->index, next_block, mtr, &next_father_cursor); + + page_cur_search( + next_block, cursor->index, tuple, PAGE_CUR_LE, + &next_page_cursor); + + max_size = page_get_max_insert_size_after_reorganize(next_page, 1); + + /* Extends gap lock for the next page */ + if (!dict_table_is_locking_disabled(cursor->index->table)) { + lock_update_split_left(next_block, block); + } + + rec = page_cur_tuple_insert( + &next_page_cursor, tuple, cursor->index, offsets, &heap, + n_ext, mtr); + + if (rec == NULL) { + if (is_leaf + && next_block->page.zip.ssize + && !dict_index_is_clust(cursor->index) + && !cursor->index->table->is_temporary()) { + /* Reset the IBUF_BITMAP_FREE bits, because + page_cur_tuple_insert() will have attempted page + reorganize before failing. */ + ibuf_reset_free_bits(next_block); + } + return(NULL); + } + + ibool compressed; + dberr_t err; + ulint level = btr_page_get_level(next_page); + + /* adjust cursor position */ + *btr_cur_get_page_cur(cursor) = next_page_cursor; + + ut_ad(btr_cur_get_rec(cursor) == page_get_infimum_rec(next_page)); + ut_ad(page_rec_get_next(page_get_infimum_rec(next_page)) == rec); + + /* We have to change the parent node pointer */ + + compressed = btr_cur_pessimistic_delete( + &err, TRUE, &next_father_cursor, + BTR_CREATE_FLAG, false, mtr); + + ut_a(err == DB_SUCCESS); + + if (!compressed) { + btr_cur_compress_if_useful(&next_father_cursor, FALSE, mtr); + } + + dtuple_t* node_ptr = dict_index_build_node_ptr( + cursor->index, rec, next_block->page.id().page_no(), + heap, level); + + btr_insert_on_non_leaf_level( + flags, cursor->index, level + 1, node_ptr, mtr); + + ut_ad(rec_offs_validate(rec, cursor->index, *offsets)); + + if (is_leaf + && !dict_index_is_clust(cursor->index) + && !cursor->index->table->is_temporary()) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. 
*/ + + if (next_block->page.zip.ssize) { + ibuf_update_free_bits_zip(next_block, mtr); + } else { + ibuf_update_free_bits_if_full( + next_block, max_size, + rec_offs_size(*offsets) + PAGE_DIR_SLOT_SIZE); + } + } + + return(rec); +} + +/*************************************************************//** +Splits an index page to halves and inserts the tuple. It is assumed +that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is +released within this function! NOTE that the operation of this +function must always succeed, we cannot reverse it: therefore enough +free disk space (2 pages) must be guaranteed to be available before +this function is called. +NOTE: jonaso added support for calling function with tuple == NULL +which cause it to only split a page. + +@return inserted record or NULL if run out of space */ +rec_t* +btr_page_split_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert; when the + function returns, the cursor is positioned + on the predecessor of the inserted record */ + rec_offs** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + buf_block_t* new_block; + page_t* new_page; + page_zip_des_t* new_page_zip; + rec_t* split_rec; + buf_block_t* left_block; + buf_block_t* right_block; + page_cur_t* page_cursor; + rec_t* first_rec; + byte* buf = 0; /* remove warning */ + rec_t* move_limit; + ulint n_iterations = 0; + ulint n_uniq; + + if (cursor->index->is_spatial()) { + /* Split rtree page and update parent */ + return(rtr_page_split_and_insert(flags, cursor, offsets, heap, + tuple, n_ext, mtr)); + } + + if (!*heap) { + *heap = mem_heap_create(1024); + } + n_uniq = dict_index_get_n_unique_in_tree(cursor->index); +func_start: + mem_heap_empty(*heap); + *offsets = NULL; + + ut_ad(mtr->memo_contains_flagged(&cursor->index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(!dict_index_is_online_ddl(cursor->index) + || (flags & BTR_CREATE_FLAG) + || dict_index_is_clust(cursor->index)); + ut_ad(rw_lock_own_flagged(dict_index_get_lock(cursor->index), + RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX)); + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!page_is_empty(page)); + + /* try to insert to the next page if possible before split */ + if (rec_t* rec = btr_insert_into_right_sibling( + flags, cursor, offsets, *heap, tuple, n_ext, mtr)) { + return(rec); + } + + /* 1. 
Decide the split record; split_rec == NULL means that the + tuple to be inserted should be the first record on the upper + half-page */ + bool insert_left = false; + uint32_t hint_page_no = block->page.id().page_no() + 1; + byte direction = FSP_UP; + + if (tuple && n_iterations > 0) { + split_rec = btr_page_get_split_rec(cursor, tuple, n_ext); + + if (split_rec == NULL) { + insert_left = btr_page_tuple_smaller( + cursor, tuple, offsets, n_uniq, heap); + } + } else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) { + } else if ((split_rec = btr_page_get_split_rec_to_left(cursor))) { + direction = FSP_DOWN; + hint_page_no -= 2; + } else { + /* If there is only one record in the index page, we + can't split the node in the middle by default. We need + to determine whether the new record will be inserted + to the left or right. */ + + if (page_get_n_recs(page) > 1) { + split_rec = page_get_middle_rec(page); + } else if (btr_page_tuple_smaller(cursor, tuple, + offsets, n_uniq, heap)) { + split_rec = page_rec_get_next( + page_get_infimum_rec(page)); + } else { + split_rec = NULL; + } + } + + DBUG_EXECUTE_IF("disk_is_full", + os_has_said_disk_full = true; + return(NULL);); + + /* 2. Allocate a new page to the index */ + const uint16_t page_level = btr_page_get_level(page); + new_block = btr_page_alloc(cursor->index, hint_page_no, direction, + page_level, mtr, mtr); + + if (!new_block) { + return(NULL); + } + + new_page = buf_block_get_frame(new_block); + new_page_zip = buf_block_get_page_zip(new_block); + + if (page_level && UNIV_LIKELY_NULL(new_page_zip)) { + /* ROW_FORMAT=COMPRESSED non-leaf pages are not expected + to contain FIL_NULL in FIL_PAGE_PREV at this stage. */ + memset_aligned<4>(new_page + FIL_PAGE_PREV, 0, 4); + } + btr_page_create(new_block, new_page_zip, cursor->index, + page_level, mtr); + /* Only record the leaf level page splits. */ + if (!page_level) { + cursor->index->stat_defrag_n_page_split ++; + cursor->index->stat_defrag_modified_counter ++; + btr_defragment_save_defrag_stats_if_needed(cursor->index); + } + + /* 3. Calculate the first record on the upper half-page, and the + first record (move_limit) on original page which ends up on the + upper half */ + + if (split_rec) { + first_rec = move_limit = split_rec; + + *offsets = rec_get_offsets(split_rec, cursor->index, *offsets, + page_is_leaf(page) + ? cursor->index->n_core_fields : 0, + n_uniq, heap); + + insert_left = !tuple + || cmp_dtuple_rec(tuple, split_rec, *offsets) < 0; + + if (!insert_left && new_page_zip && n_iterations > 0) { + /* If a compressed page has already been split, + avoid further splits by inserting the record + to an empty page. */ + split_rec = NULL; + goto insert_empty; + } + } else if (insert_left) { + ut_a(n_iterations > 0); + first_rec = page_rec_get_next(page_get_infimum_rec(page)); + move_limit = page_rec_get_next(btr_cur_get_rec(cursor)); + } else { +insert_empty: + ut_ad(!split_rec); + ut_ad(!insert_left); + buf = UT_NEW_ARRAY_NOKEY( + byte, + rec_get_converted_size(cursor->index, tuple, n_ext)); + + first_rec = rec_convert_dtuple_to_rec(buf, cursor->index, + tuple, n_ext); + move_limit = page_rec_get_next(btr_cur_get_rec(cursor)); + } + + /* 4. Do first the modifications in the tree structure */ + + /* FIXME: write FIL_PAGE_PREV,FIL_PAGE_NEXT in new_block earlier! 
*/ + btr_attach_half_pages(flags, cursor->index, block, + first_rec, new_block, direction, mtr); + + /* If the split is made on the leaf level and the insert will fit + on the appropriate half-page, we may release the tree x-latch. + We can then move the records after releasing the tree latch, + thus reducing the tree latch contention. */ + bool insert_will_fit; + if (tuple == NULL) { + insert_will_fit = true; + } else if (split_rec) { + insert_will_fit = !new_page_zip + && btr_page_insert_fits(cursor, split_rec, + offsets, tuple, n_ext, heap); + } else { + if (!insert_left) { + UT_DELETE_ARRAY(buf); + buf = NULL; + } + + insert_will_fit = !new_page_zip + && btr_page_insert_fits(cursor, NULL, + offsets, tuple, n_ext, heap); + } + + if (!srv_read_only_mode + && insert_will_fit + && page_is_leaf(page) + && !dict_index_is_online_ddl(cursor->index)) { + + mtr->memo_release( + dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK); + + /* NOTE: We cannot release root block latch here, because it + has segment header and already modified in most of cases.*/ + } + + /* 5. Move then the records to the new page */ + if (direction == FSP_DOWN) { + /* fputs("Split left\n", stderr); */ + + if (0 +#ifdef UNIV_ZIP_COPY + || page_zip +#endif /* UNIV_ZIP_COPY */ + || !page_move_rec_list_start(new_block, block, move_limit, + cursor->index, mtr)) { + /* For some reason, compressing new_page failed, + even though it should contain fewer records than + the original page. Copy the page byte for byte + and then delete the records from both pages + as appropriate. Deleting will always succeed. */ + ut_a(new_page_zip); + + page_zip_copy_recs(new_block, + page_zip, page, cursor->index, mtr); + page_delete_rec_list_end(move_limit - page + new_page, + new_block, cursor->index, + ULINT_UNDEFINED, + ULINT_UNDEFINED, mtr); + + /* Update the lock table and possible hash index. */ + lock_move_rec_list_start( + new_block, block, move_limit, + new_page + PAGE_NEW_INFIMUM); + + btr_search_move_or_delete_hash_entries( + new_block, block); + + /* Delete the records from the source page. */ + + page_delete_rec_list_start(move_limit, block, + cursor->index, mtr); + } + + left_block = new_block; + right_block = block; + + if (!dict_table_is_locking_disabled(cursor->index->table)) { + lock_update_split_left(right_block, left_block); + } + } else { + /* fputs("Split right\n", stderr); */ + + if (0 +#ifdef UNIV_ZIP_COPY + || page_zip +#endif /* UNIV_ZIP_COPY */ + || !page_move_rec_list_end(new_block, block, move_limit, + cursor->index, mtr)) { + /* For some reason, compressing new_page failed, + even though it should contain fewer records than + the original page. Copy the page byte for byte + and then delete the records from both pages + as appropriate. Deleting will always succeed. */ + ut_a(new_page_zip); + + page_zip_copy_recs(new_block, + page_zip, page, cursor->index, mtr); + page_delete_rec_list_start(move_limit - page + + new_page, new_block, + cursor->index, mtr); + + /* Update the lock table and possible hash index. */ + lock_move_rec_list_end(new_block, block, move_limit); + + btr_search_move_or_delete_hash_entries( + new_block, block); + + /* Delete the records from the source page. 
*/ + + page_delete_rec_list_end(move_limit, block, + cursor->index, + ULINT_UNDEFINED, + ULINT_UNDEFINED, mtr); + } + + left_block = block; + right_block = new_block; + + if (!dict_table_is_locking_disabled(cursor->index->table)) { + lock_update_split_right(right_block, left_block); + } + } + +#ifdef UNIV_ZIP_DEBUG + if (page_zip) { + ut_a(page_zip_validate(page_zip, page, cursor->index)); + ut_a(page_zip_validate(new_page_zip, new_page, cursor->index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + /* At this point, split_rec, move_limit and first_rec may point + to garbage on the old page. */ + + /* 6. The split and the tree modification is now completed. Decide the + page where the tuple should be inserted */ + rec_t* rec; + buf_block_t* const insert_block = insert_left + ? left_block : right_block; + + if (UNIV_UNLIKELY(!tuple)) { + rec = NULL; + goto func_exit; + } + + /* 7. Reposition the cursor for insert and try insertion */ + page_cursor = btr_cur_get_page_cur(cursor); + + page_cur_search(insert_block, cursor->index, tuple, page_cursor); + + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, + offsets, heap, n_ext, mtr); + +#ifdef UNIV_ZIP_DEBUG + { + page_t* insert_page + = buf_block_get_frame(insert_block); + + page_zip_des_t* insert_page_zip + = buf_block_get_page_zip(insert_block); + + ut_a(!insert_page_zip + || page_zip_validate(insert_page_zip, insert_page, + cursor->index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (rec != NULL) { + + goto func_exit; + } + + /* 8. If insert did not fit, try page reorganization. + For compressed pages, page_cur_tuple_insert() will have + attempted this already. */ + + if (page_cur_get_page_zip(page_cursor) + || !btr_page_reorganize(page_cursor, cursor->index, mtr)) { + + goto insert_failed; + } + + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, + offsets, heap, n_ext, mtr); + + if (rec == NULL) { + /* The insert did not fit on the page: loop back to the + start of the function for a new split */ +insert_failed: + /* We play safe and reset the free bits for new_page */ + if (!dict_index_is_clust(cursor->index) + && !cursor->index->table->is_temporary()) { + ibuf_reset_free_bits(new_block); + ibuf_reset_free_bits(block); + } + + n_iterations++; + ut_ad(n_iterations < 2 + || buf_block_get_page_zip(insert_block)); + ut_ad(!insert_will_fit); + + goto func_start; + } + +func_exit: + /* Insert fit on the page: update the free bits for the + left and right pages in the same mtr */ + + if (!dict_index_is_clust(cursor->index) + && !cursor->index->table->is_temporary() + && page_is_leaf(page)) { + + ibuf_update_free_bits_for_two_pages_low( + left_block, right_block, mtr); + } + + MONITOR_INC(MONITOR_INDEX_SPLIT); + + ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index)); + ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index)); + + ut_ad(tuple || !rec); + ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets)); + return(rec); +} + +/** Remove a page from the level list of pages. 
+@param[in] block page to remove +@param[in] index index tree +@param[in,out] mtr mini-transaction */ +void btr_level_list_remove(const buf_block_t& block, const dict_index_t& index, + mtr_t* mtr) +{ + ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(block.zip_size() == index.table->space->zip_size()); + ut_ad(index.table->space->id == block.page.id().space()); + /* Get the previous and next page numbers of page */ + + const page_t* page = block.frame; + const uint32_t prev_page_no = btr_page_get_prev(page); + const uint32_t next_page_no = btr_page_get_next(page); + + /* Update page links of the level */ + + if (prev_page_no != FIL_NULL) { + buf_block_t* prev_block = btr_block_get( + index, prev_page_no, RW_X_LATCH, page_is_leaf(page), + mtr); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(prev_block->frame) == page_is_comp(page)); + static_assert(FIL_PAGE_NEXT % 4 == 0, "alignment"); + static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); + ut_a(!memcmp_aligned<4>(prev_block->frame + FIL_PAGE_NEXT, + page + FIL_PAGE_OFFSET, 4)); +#endif /* UNIV_BTR_DEBUG */ + + btr_page_set_next(prev_block, next_page_no, mtr); + } + + if (next_page_no != FIL_NULL) { + buf_block_t* next_block = btr_block_get( + index, next_page_no, RW_X_LATCH, page_is_leaf(page), + mtr); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(next_block->frame) == page_is_comp(page)); + static_assert(FIL_PAGE_PREV % 4 == 0, "alignment"); + static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); + ut_a(!memcmp_aligned<4>(next_block->frame + FIL_PAGE_PREV, + page + FIL_PAGE_OFFSET, 4)); +#endif /* UNIV_BTR_DEBUG */ + + btr_page_set_prev(next_block, prev_page_no, mtr); + } +} + +/*************************************************************//** +If page is the only on its level, this function moves its records to the +father page, thus reducing the tree height. +@return father block */ +UNIV_INTERN +buf_block_t* +btr_lift_page_up( +/*=============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page which is the only on its level; + must not be empty: use + btr_discard_only_page_on_level if the last + record from the page should be removed */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* father_block; + ulint page_level; + page_zip_des_t* father_page_zip; + page_t* page = buf_block_get_frame(block); + ulint root_page_no; + buf_block_t* blocks[BTR_MAX_LEVELS]; + ulint n_blocks; /*!< last used index in blocks[] */ + ulint i; + bool lift_father_up; + buf_block_t* block_orig = block; + + ut_ad(!page_has_siblings(page)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + + page_level = btr_page_get_level(page); + root_page_no = dict_index_get_page(index); + + { + btr_cur_t cursor; + rec_offs* offsets = NULL; + mem_heap_t* heap = mem_heap_create( + sizeof(*offsets) + * (REC_OFFS_HEADER_SIZE + 1 + 1 + + unsigned(index->n_fields))); + buf_block_t* b; + + if (dict_index_is_spatial(index)) { + offsets = rtr_page_get_father_block( + NULL, heap, index, block, mtr, + NULL, &cursor); + } else { + offsets = btr_page_get_father_block(offsets, heap, + index, block, + mtr, &cursor); + } + father_block = btr_cur_get_block(&cursor); + father_page_zip = buf_block_get_page_zip(father_block); + + n_blocks = 0; + + /* Store all ancestor pages so we can reset their + levels later on. We have to do all the searches on + the tree now because later on, after we've replaced + the first level, the tree is in an inconsistent state + and can not be searched. 
*/ + for (b = father_block; + b->page.id().page_no() != root_page_no; ) { + ut_a(n_blocks < BTR_MAX_LEVELS); + + if (dict_index_is_spatial(index)) { + offsets = rtr_page_get_father_block( + NULL, heap, index, b, mtr, + NULL, &cursor); + } else { + offsets = btr_page_get_father_block(offsets, + heap, + index, b, + mtr, + &cursor); + } + + blocks[n_blocks++] = b = btr_cur_get_block(&cursor); + } + + lift_father_up = (n_blocks && page_level == 0); + if (lift_father_up) { + /* The father page also should be the only on its level (not + root). We should lift up the father page at first. + Because the leaf page should be lifted up only for root page. + The freeing page is based on page_level (==0 or !=0) + to choose segment. If the page_level is changed ==0 from !=0, + later freeing of the page doesn't find the page allocation + to be freed.*/ + + block = father_block; + page = buf_block_get_frame(block); + page_level = btr_page_get_level(page); + + ut_ad(!page_has_siblings(page)); + ut_ad(mtr->memo_contains_flagged(block, + MTR_MEMO_PAGE_X_FIX)); + + father_block = blocks[0]; + father_page_zip = buf_block_get_page_zip(father_block); + } + + mem_heap_free(heap); + } + + btr_search_drop_page_hash_index(block); + + /* Make the father empty */ + btr_page_empty(father_block, father_page_zip, index, page_level, mtr); + /* btr_page_empty() is supposed to zero-initialize the field. */ + ut_ad(!page_get_instant(father_block->frame)); + + if (index->is_instant() + && father_block->page.id().page_no() == root_page_no) { + ut_ad(!father_page_zip); + btr_set_instant(father_block, *index, mtr); + } + + page_level++; + + /* Copy the records to the father page one by one. */ + if (0 +#ifdef UNIV_ZIP_COPY + || father_page_zip +#endif /* UNIV_ZIP_COPY */ + || !page_copy_rec_list_end(father_block, block, + page_get_infimum_rec(page), + index, mtr)) { + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(father_page_zip); + ut_a(page_zip); + + /* Copy the page byte for byte. */ + page_zip_copy_recs(father_block, + page_zip, page, index, mtr); + + /* Update the lock table and possible hash index. */ + + lock_move_rec_list_end(father_block, block, + page_get_infimum_rec(page)); + + /* Also update the predicate locks */ + if (dict_index_is_spatial(index)) { + lock_prdt_rec_move(father_block, block); + } else { + btr_search_move_or_delete_hash_entries( + father_block, block); + } + } + + if (!dict_table_is_locking_disabled(index->table)) { + /* Free predicate page locks on the block */ + if (dict_index_is_spatial(index)) { + lock_mutex_enter(); + lock_prdt_page_free_from_discard( + block, &lock_sys.prdt_page_hash); + lock_mutex_exit(); + } + lock_update_copy_and_discard(father_block, block); + } + + /* Go upward to root page, decrementing levels by one. */ + for (i = lift_father_up ? 1 : 0; i < n_blocks; i++, page_level++) { + ut_ad(btr_page_get_level(blocks[i]->frame) == page_level + 1); + btr_page_set_level(blocks[i], page_level, mtr); + } + + if (dict_index_is_spatial(index)) { + rtr_check_discard_page(index, NULL, block); + } + + /* Free the file page */ + btr_page_free(index, block, mtr); + + /* We play it safe and reset the free bits for the father */ + if (!dict_index_is_clust(index) + && !index->table->is_temporary()) { + ibuf_reset_free_bits(father_block); + } + ut_ad(page_validate(father_block->frame, index)); + ut_ad(btr_check_node_ptr(index, father_block, mtr)); + + return(lift_father_up ? 
block_orig : father_block); +} + +/*************************************************************//** +Tries to merge the page first to the left immediate brother if such a +brother exists, and the node pointers to the current page and to the brother +reside on the same page. If the left brother does not satisfy these +conditions, looks at the right brother. If the page is the only one on that +level lifts the records of the page to the father page, thus reducing the +tree height. It is assumed that mtr holds an x-latch on the tree and on the +page. If cursor is on the leaf level, mtr must also hold x-latches to the +brothers, if they exist. +@return TRUE on success */ +ibool +btr_compress( +/*=========*/ + btr_cur_t* cursor, /*!< in/out: cursor on the page to merge + or lift; the page must not be empty: + when deleting records, use btr_discard_page() + if the page would become empty */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + dict_index_t* index; + buf_block_t* merge_block; + page_t* merge_page = NULL; + page_zip_des_t* merge_page_zip; + ibool is_left; + buf_block_t* block; + page_t* page; + btr_cur_t father_cursor; + mem_heap_t* heap; + rec_offs* offsets; + ulint nth_rec = 0; /* remove bogus warning */ + bool mbr_changed = false; +#ifdef UNIV_DEBUG + bool leftmost_child; +#endif + DBUG_ENTER("btr_compress"); + + block = btr_cur_get_block(cursor); + page = btr_cur_get_page(cursor); + index = btr_cur_get_index(cursor); + + btr_assert_not_corrupted(block, index); + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + + MONITOR_INC(MONITOR_INDEX_MERGE_ATTEMPTS); + + const uint32_t left_page_no = btr_page_get_prev(page); + const uint32_t right_page_no = btr_page_get_next(page); + +#ifdef UNIV_DEBUG + if (!page_is_leaf(page) && left_page_no == FIL_NULL) { + ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + page_rec_get_next(page_get_infimum_rec(page)), + page_is_comp(page))); + } +#endif /* UNIV_DEBUG */ + + heap = mem_heap_create(100); + + if (dict_index_is_spatial(index)) { + offsets = rtr_page_get_father_block( + NULL, heap, index, block, mtr, cursor, &father_cursor); + ut_ad(cursor->page_cur.block->page.id() == block->page.id()); + rec_t* my_rec = father_cursor.page_cur.rec; + + ulint page_no = btr_node_ptr_get_child_page_no(my_rec, offsets); + + if (page_no != block->page.id().page_no()) { + ib::info() << "father positioned on page " + << page_no << "instead of " + << block->page.id().page_no(); + offsets = btr_page_get_father_block( + NULL, heap, index, block, mtr, &father_cursor); + } + } else { + offsets = btr_page_get_father_block( + NULL, heap, index, block, mtr, &father_cursor); + } + + if (adjust) { + nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor)); + ut_ad(nth_rec > 0); + } + + if (left_page_no == FIL_NULL && right_page_no == FIL_NULL) { + /* The page is the only one on the level, lift the records + to the father */ + + merge_block = btr_lift_page_up(index, block, mtr); + goto func_exit; + } + + ut_d(leftmost_child = + left_page_no != FIL_NULL + && (page_rec_get_next( + page_get_infimum_rec( + btr_cur_get_page(&father_cursor))) + == btr_cur_get_rec(&father_cursor))); + + /* Decide the page to which we try to merge and which will inherit + the locks */ + + is_left = btr_can_merge_with_page(cursor, left_page_no, + &merge_block, mtr); + + 
DBUG_EXECUTE_IF("ib_always_merge_right", is_left = FALSE;); +retry: + if (!is_left + && !btr_can_merge_with_page(cursor, right_page_no, &merge_block, + mtr)) { + if (!merge_block) { + merge_page = NULL; + } + goto err_exit; + } + + merge_page = buf_block_get_frame(merge_block); + +#ifdef UNIV_BTR_DEBUG + if (is_left) { + ut_a(btr_page_get_next(merge_page) + == block->page.id().page_no()); + } else { + ut_a(btr_page_get_prev(merge_page) + == block->page.id().page_no()); + } +#endif /* UNIV_BTR_DEBUG */ + + ut_ad(page_validate(merge_page, index)); + + merge_page_zip = buf_block_get_page_zip(merge_block); +#ifdef UNIV_ZIP_DEBUG + if (merge_page_zip) { + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(page_zip); + ut_a(page_zip_validate(merge_page_zip, merge_page, index)); + ut_a(page_zip_validate(page_zip, page, index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + /* Move records to the merge page */ + if (is_left) { + btr_cur_t cursor2; + rtr_mbr_t new_mbr; + rec_offs* offsets2 = NULL; + + /* For rtree, we need to update father's mbr. */ + if (index->is_spatial()) { + /* We only support merge pages with the same parent + page */ + if (!rtr_check_same_block( + index, &cursor2, + btr_cur_get_block(&father_cursor), + merge_block, heap)) { + is_left = false; + goto retry; + } + + /* Set rtr_info for cursor2, since it is + necessary in recursive page merge. */ + cursor2.rtr_info = cursor->rtr_info; + cursor2.tree_height = cursor->tree_height; + + offsets2 = rec_get_offsets( + btr_cur_get_rec(&cursor2), index, NULL, + page_is_leaf(cursor2.page_cur.block->frame) + ? index->n_fields : 0, + ULINT_UNDEFINED, &heap); + + /* Check if parent entry needs to be updated */ + mbr_changed = rtr_merge_mbr_changed( + &cursor2, &father_cursor, + offsets2, offsets, &new_mbr); + } + + rec_t* orig_pred = page_copy_rec_list_start( + merge_block, block, page_get_supremum_rec(page), + index, mtr); + + if (!orig_pred) { + goto err_exit; + } + + btr_search_drop_page_hash_index(block); + + /* Remove the page from the level list */ + btr_level_list_remove(*block, *index, mtr); + + if (dict_index_is_spatial(index)) { + rec_t* my_rec = father_cursor.page_cur.rec; + + ulint page_no = btr_node_ptr_get_child_page_no( + my_rec, offsets); + + if (page_no != block->page.id().page_no()) { + ib::fatal() << "father positioned on " + << page_no << " instead of " + << block->page.id().page_no(); + } + + if (mbr_changed) { +#ifdef UNIV_DEBUG + bool success = rtr_update_mbr_field( + &cursor2, offsets2, &father_cursor, + merge_page, &new_mbr, NULL, mtr); + + ut_ad(success); +#else + rtr_update_mbr_field( + &cursor2, offsets2, &father_cursor, + merge_page, &new_mbr, NULL, mtr); +#endif + } else { + rtr_node_ptr_delete(&father_cursor, mtr); + } + + /* No GAP lock needs to be worrying about */ + lock_mutex_enter(); + lock_prdt_page_free_from_discard( + block, &lock_sys.prdt_page_hash); + lock_rec_free_all_from_discard_page(block); + lock_mutex_exit(); + } else { + btr_cur_node_ptr_delete(&father_cursor, mtr); + if (!dict_table_is_locking_disabled(index->table)) { + lock_update_merge_left( + merge_block, orig_pred, block); + } + } + + if (adjust) { + nth_rec += page_rec_get_n_recs_before(orig_pred); + } + } else { + rec_t* orig_succ; + ibool compressed; + dberr_t err; + btr_cur_t cursor2; + /* father cursor pointing to node ptr + of the right sibling */ +#ifdef UNIV_BTR_DEBUG + byte fil_page_prev[4]; +#endif /* UNIV_BTR_DEBUG */ + + if (dict_index_is_spatial(index)) { + cursor2.rtr_info = NULL; + + /* For spatial index, we 
disallow merge of blocks + with different parents, since the merge would need + to update entry (for MBR and Primary key) in the + parent of block being merged */ + if (!rtr_check_same_block( + index, &cursor2, + btr_cur_get_block(&father_cursor), + merge_block, heap)) { + goto err_exit; + } + + /* Set rtr_info for cursor2, since it is + necessary in recursive page merge. */ + cursor2.rtr_info = cursor->rtr_info; + cursor2.tree_height = cursor->tree_height; + } else { + btr_page_get_father(index, merge_block, mtr, &cursor2); + } + + if (merge_page_zip && left_page_no == FIL_NULL) { + + /* The function page_zip_compress(), which will be + invoked by page_copy_rec_list_end() below, + requires that FIL_PAGE_PREV be FIL_NULL. + Clear the field, but prepare to restore it. */ + static_assert(FIL_PAGE_PREV % 8 == 0, "alignment"); +#ifdef UNIV_BTR_DEBUG + memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4); +#endif /* UNIV_BTR_DEBUG */ + compile_time_assert(FIL_NULL == 0xffffffffU); + memset_aligned<4>(merge_page + FIL_PAGE_PREV, 0xff, 4); + } + + orig_succ = page_copy_rec_list_end(merge_block, block, + page_get_infimum_rec(page), + cursor->index, mtr); + + if (!orig_succ) { + ut_a(merge_page_zip); +#ifdef UNIV_BTR_DEBUG + if (left_page_no == FIL_NULL) { + /* FIL_PAGE_PREV was restored from + merge_page_zip. */ + ut_a(!memcmp(fil_page_prev, + merge_page + FIL_PAGE_PREV, 4)); + } +#endif /* UNIV_BTR_DEBUG */ + goto err_exit; + } + + btr_search_drop_page_hash_index(block); + +#ifdef UNIV_BTR_DEBUG + if (merge_page_zip && left_page_no == FIL_NULL) { + + /* Restore FIL_PAGE_PREV in order to avoid an assertion + failure in btr_level_list_remove(), which will set + the field again to FIL_NULL. Even though this makes + merge_page and merge_page_zip inconsistent for a + split second, it is harmless, because the pages + are X-latched. */ + memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4); + } +#endif /* UNIV_BTR_DEBUG */ + + /* Remove the page from the level list */ + btr_level_list_remove(*block, *index, mtr); + + ut_ad(btr_node_ptr_get_child_page_no( + btr_cur_get_rec(&father_cursor), offsets) + == block->page.id().page_no()); + + /* Replace the address of the old child node (= page) with the + address of the merge page to the right */ + btr_node_ptr_set_child_page_no( + btr_cur_get_block(&father_cursor), + btr_cur_get_rec(&father_cursor), + offsets, right_page_no, mtr); + +#ifdef UNIV_DEBUG + if (!page_is_leaf(page) && left_page_no == FIL_NULL) { + ut_ad(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + page_rec_get_next(page_get_infimum_rec( + buf_block_get_frame(merge_block))), + page_is_comp(page))); + } +#endif /* UNIV_DEBUG */ + + /* For rtree, we need to update father's mbr. */ + if (index->is_spatial()) { + rec_offs* offsets2; + ulint rec_info; + + offsets2 = rec_get_offsets( + btr_cur_get_rec(&cursor2), index, NULL, + page_is_leaf(cursor2.page_cur.block->frame) + ? index->n_fields : 0, + ULINT_UNDEFINED, &heap); + + ut_ad(btr_node_ptr_get_child_page_no( + btr_cur_get_rec(&cursor2), offsets2) + == right_page_no); + + rec_info = rec_get_info_bits( + btr_cur_get_rec(&father_cursor), + rec_offs_comp(offsets)); + if (rec_info & REC_INFO_MIN_REC_FLAG) { + /* When the father node ptr is minimal rec, + we will keep it and delete the node ptr of + merge page. */ + rtr_merge_and_update_mbr(&father_cursor, + &cursor2, + offsets, offsets2, + merge_page, mtr); + } else { + /* Otherwise, we will keep the node ptr of + merge page and delete the father node ptr. 
+ This is for keeping the rec order in upper + level. */ + rtr_merge_and_update_mbr(&cursor2, + &father_cursor, + offsets2, offsets, + merge_page, mtr); + } + lock_mutex_enter(); + lock_prdt_page_free_from_discard( + block, &lock_sys.prdt_page_hash); + lock_rec_free_all_from_discard_page(block); + lock_mutex_exit(); + } else { + + compressed = btr_cur_pessimistic_delete(&err, TRUE, + &cursor2, + BTR_CREATE_FLAG, + false, mtr); + ut_a(err == DB_SUCCESS); + + if (!compressed) { + btr_cur_compress_if_useful(&cursor2, + FALSE, + mtr); + } + + if (!dict_table_is_locking_disabled(index->table)) { + lock_update_merge_right( + merge_block, orig_succ, block); + } + } + } + + if (!dict_index_is_clust(index) + && !index->table->is_temporary() + && page_is_leaf(merge_page)) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. This has to be done in a + separate mini-transaction that is committed before the + main mini-transaction. We cannot update the insert + buffer bitmap in this mini-transaction, because + btr_compress() can be invoked recursively without + committing the mini-transaction in between. Since + insert buffer bitmap pages have a lower rank than + B-tree pages, we must not access other pages in the + same mini-transaction after accessing an insert buffer + bitmap page. */ + + /* The free bits in the insert buffer bitmap must + never exceed the free space on a page. It is safe to + decrement or reset the bits in the bitmap in a + mini-transaction that is committed before the + mini-transaction that affects the free space. */ + + /* It is unsafe to increment the bits in a separately + committed mini-transaction, because in crash recovery, + the free bits could momentarily be set too high. */ + + if (merge_block->zip_size()) { + /* Because the free bits may be incremented + and we cannot update the insert buffer bitmap + in the same mini-transaction, the only safe + thing we can do here is the pessimistic + approach: reset the free bits. */ + ibuf_reset_free_bits(merge_block); + } else { + /* On uncompressed pages, the free bits will + never increase here. Thus, it is safe to + write the bits accurately in a separate + mini-transaction. */ + ibuf_update_free_bits_if_full(merge_block, + srv_page_size, + ULINT_UNDEFINED); + } + } + + ut_ad(page_validate(merge_page, index)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page, + index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (dict_index_is_spatial(index)) { + rtr_check_discard_page(index, NULL, block); + } + + /* Free the file page */ + btr_page_free(index, block, mtr); + + /* btr_check_node_ptr() needs parent block latched. + If the merge_block's parent block is not same, + we cannot use btr_check_node_ptr() */ + ut_ad(leftmost_child + || btr_check_node_ptr(index, merge_block, mtr)); +func_exit: + mem_heap_free(heap); + + if (adjust) { + ut_ad(nth_rec > 0); + btr_cur_position( + index, + page_rec_get_nth(merge_block->frame, nth_rec), + merge_block, cursor); + } + + MONITOR_INC(MONITOR_INDEX_MERGE_SUCCESSFUL); + + DBUG_RETURN(TRUE); + +err_exit: + /* We play it safe and reset the free bits. */ + if (merge_block && merge_block->zip_size() + && page_is_leaf(merge_block->frame) + && !dict_index_is_clust(index)) { + + ibuf_reset_free_bits(merge_block); + } + + mem_heap_free(heap); + DBUG_RETURN(FALSE); +} + +/*************************************************************//** +Discards a page that is the only page on its level. 
This will empty +the whole B-tree, leaving just an empty root page. This function +should almost never be reached, because btr_compress(), which is invoked in +delete operations, calls btr_lift_page_up() to flatten the B-tree. */ +ATTRIBUTE_COLD +static +void +btr_discard_only_page_on_level( +/*===========================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page which is the only on its level */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint page_level = 0; + + ut_ad(!index->is_dummy); + + /* Save the PAGE_MAX_TRX_ID from the leaf page. */ + const trx_id_t max_trx_id = page_get_max_trx_id(block->frame); + const rec_t* r = page_rec_get_next(page_get_infimum_rec(block->frame)); + ut_ad(rec_is_metadata(r, *index) == index->is_instant()); + + while (block->page.id().page_no() != dict_index_get_page(index)) { + btr_cur_t cursor; + buf_block_t* father; + const page_t* page = buf_block_get_frame(block); + + ut_a(page_get_n_recs(page) == 1); + ut_a(page_level == btr_page_get_level(page)); + ut_a(!page_has_siblings(page)); + ut_ad(fil_page_index_page_check(page)); + ut_ad(block->page.id().space() == index->table->space->id); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + btr_search_drop_page_hash_index(block); + + if (dict_index_is_spatial(index)) { + /* Check any concurrent search having this page */ + rtr_check_discard_page(index, NULL, block); + rtr_page_get_father(index, block, mtr, NULL, &cursor); + } else { + btr_page_get_father(index, block, mtr, &cursor); + } + father = btr_cur_get_block(&cursor); + + if (!dict_table_is_locking_disabled(index->table)) { + lock_update_discard( + father, PAGE_HEAP_NO_SUPREMUM, block); + } + + /* Free the file page */ + btr_page_free(index, block, mtr); + + block = father; + page_level++; + } + + /* block is the root page, which must be empty, except + for the node pointer to the (now discarded) block(s). */ + ut_ad(!page_has_siblings(block->frame)); + +#ifdef UNIV_BTR_DEBUG + if (!dict_index_is_ibuf(index)) { + const page_t* root = buf_block_get_frame(block); + const ulint space = index->table->space_id; + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + root, space)); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, space)); + } +#endif /* UNIV_BTR_DEBUG */ + + mem_heap_t* heap = nullptr; + const rec_t* rec = nullptr; + rec_offs* offsets = nullptr; + if (index->table->instant || index->must_avoid_clear_instant_add()) { + if (!rec_is_metadata(r, *index)) { + } else if (!index->table->instant + || rec_is_alter_metadata(r, *index)) { + heap = mem_heap_create(srv_page_size); + offsets = rec_get_offsets(r, index, nullptr, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + rec = rec_copy(mem_heap_alloc(heap, + rec_offs_size(offsets)), + r, offsets); + rec_offs_make_valid(rec, index, true, offsets); + } + } + + btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr); + ut_ad(page_is_leaf(buf_block_get_frame(block))); + /* btr_page_empty() is supposed to zero-initialize the field. 
*/ + ut_ad(!page_get_instant(block->frame)); + + if (index->is_primary()) { + if (rec) { + page_cur_t cur; + page_cur_set_before_first(block, &cur); + DBUG_ASSERT(index->table->instant); + DBUG_ASSERT(rec_is_alter_metadata(rec, *index)); + btr_set_instant(block, *index, mtr); + rec = page_cur_insert_rec_low(&cur, index, rec, + offsets, mtr); + ut_ad(rec); + mem_heap_free(heap); + } else if (index->is_instant()) { + index->clear_instant_add(); + } + } else if (!index->table->is_temporary()) { + /* We play it safe and reset the free bits for the root */ + ibuf_reset_free_bits(block); + + ut_a(max_trx_id); + page_set_max_trx_id(block, + buf_block_get_page_zip(block), + max_trx_id, mtr); + } +} + +/*************************************************************//** +Discards a page from a B-tree. This is used to remove the last record from +a B-tree page: the whole page must be removed at the same time. This cannot +be used for the root page, which is allowed to be empty. */ +void +btr_discard_page( +/*=============*/ + btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on + the root page */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + buf_block_t* merge_block; + buf_block_t* block; + btr_cur_t parent_cursor; + + block = btr_cur_get_block(cursor); + index = btr_cur_get_index(cursor); + + ut_ad(dict_index_get_page(index) != block->page.id().page_no()); + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + + MONITOR_INC(MONITOR_INDEX_DISCARD); + + if (dict_index_is_spatial(index)) { + rtr_page_get_father(index, block, mtr, cursor, &parent_cursor); + } else { + btr_page_get_father(index, block, mtr, &parent_cursor); + } + + /* Decide the page which will inherit the locks */ + + const uint32_t left_page_no = btr_page_get_prev(block->frame); + const uint32_t right_page_no = btr_page_get_next(block->frame); + + ut_d(bool parent_is_different = false); + if (left_page_no != FIL_NULL) { + merge_block = btr_block_get(*index, left_page_no, RW_X_LATCH, + true, mtr); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_next(merge_block->frame) + == block->page.id().page_no()); +#endif /* UNIV_BTR_DEBUG */ + ut_d(parent_is_different = + (page_rec_get_next( + page_get_infimum_rec( + btr_cur_get_page( + &parent_cursor))) + == btr_cur_get_rec(&parent_cursor))); + } else if (right_page_no != FIL_NULL) { + merge_block = btr_block_get(*index, right_page_no, RW_X_LATCH, + true, mtr); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_prev(merge_block->frame) + == block->page.id().page_no()); +#endif /* UNIV_BTR_DEBUG */ + ut_d(parent_is_different = page_rec_is_supremum( + page_rec_get_next(btr_cur_get_rec(&parent_cursor)))); + if (!page_is_leaf(merge_block->frame)) { + rec_t* node_ptr = page_rec_get_next( + page_get_infimum_rec(merge_block->frame)); + ut_ad(page_rec_is_user_rec(node_ptr)); + /* We have to mark the leftmost node pointer as the + predefined minimum record. 
*/ + btr_set_min_rec_mark<true>(node_ptr, *merge_block, + mtr); + } + } else { + btr_discard_only_page_on_level(index, block, mtr); + + return; + } + + ut_a(page_is_comp(merge_block->frame) == page_is_comp(block->frame)); + ut_ad(!memcmp_aligned<2>(&merge_block->frame[PAGE_HEADER + PAGE_LEVEL], + &block->frame[PAGE_HEADER + PAGE_LEVEL], 2)); + btr_search_drop_page_hash_index(block); + + if (dict_index_is_spatial(index)) { + rtr_node_ptr_delete(&parent_cursor, mtr); + } else { + btr_cur_node_ptr_delete(&parent_cursor, mtr); + } + + /* Remove the page from the level list */ + btr_level_list_remove(*block, *index, mtr); + +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* merge_page_zip + = buf_block_get_page_zip(merge_block); + ut_a(!merge_page_zip + || page_zip_validate(merge_page_zip, merge_block->frame, + index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (!dict_table_is_locking_disabled(index->table)) { + if (left_page_no != FIL_NULL) { + lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM, + block); + } else { + lock_update_discard(merge_block, + lock_get_min_heap_no(merge_block), + block); + } + } + + if (dict_index_is_spatial(index)) { + rtr_check_discard_page(index, cursor, block); + } + + /* Free the file page */ + btr_page_free(index, block, mtr); + + /* btr_check_node_ptr() needs parent block latched. + If the merge_block's parent block is not same, + we cannot use btr_check_node_ptr() */ + ut_ad(parent_is_different + || btr_check_node_ptr(index, merge_block, mtr)); + + if (btr_cur_get_block(&parent_cursor)->page.id().page_no() + == index->page + && !page_has_siblings(btr_cur_get_page(&parent_cursor)) + && page_get_n_recs(btr_cur_get_page(&parent_cursor)) == 1) { + btr_lift_page_up(index, merge_block, mtr); + } +} + +#ifdef UNIV_BTR_PRINT +/*************************************************************//** +Prints size info of a B-tree. */ +void +btr_print_size( +/*===========*/ + dict_index_t* index) /*!< in: index tree */ +{ + page_t* root; + fseg_header_t* seg; + mtr_t mtr; + + if (dict_index_is_ibuf(index)) { + fputs("Sorry, cannot print info of an ibuf tree:" + " use ibuf functions\n", stderr); + + return; + } + + mtr_start(&mtr); + + root = btr_root_get(index, &mtr); + + seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; + + fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr); + fseg_print(seg, &mtr); + + if (!dict_index_is_ibuf(index)) { + + seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + + fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr); + fseg_print(seg, &mtr); + } + + mtr_commit(&mtr); +} + +/************************************************************//** +Prints recursively index tree pages. 
*/ +static +void +btr_print_recursive( +/*================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + ulint width, /*!< in: print this many entries from start + and end */ + mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */ + rec_offs** offsets,/*!< in/out: buffer for rec_get_offsets() */ + mtr_t* mtr) /*!< in: mtr */ +{ + const page_t* page = buf_block_get_frame(block); + page_cur_t cursor; + ulint n_recs; + ulint i = 0; + mtr_t mtr2; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_SX_FIX)); + + ib::info() << "NODE ON LEVEL " << btr_page_get_level(page) + << " page " << block->page.id; + + page_print(block, index, width, width); + + n_recs = page_get_n_recs(page); + + page_cur_set_before_first(block, &cursor); + page_cur_move_to_next(&cursor); + + while (!page_cur_is_after_last(&cursor)) { + + if (page_is_leaf(page)) { + + /* If this is the leaf level, do nothing */ + + } else if ((i <= width) || (i >= n_recs - width)) { + + const rec_t* node_ptr; + + mtr_start(&mtr2); + + node_ptr = page_cur_get_rec(&cursor); + + *offsets = rec_get_offsets( + node_ptr, index, *offsets, 0, + ULINT_UNDEFINED, heap); + btr_print_recursive(index, + btr_node_ptr_get_child(node_ptr, + index, + *offsets, + &mtr2), + width, heap, offsets, &mtr2); + mtr_commit(&mtr2); + } + + page_cur_move_to_next(&cursor); + i++; + } +} + +/**************************************************************//** +Prints directories and other info of all nodes in the tree. */ +void +btr_print_index( +/*============*/ + dict_index_t* index, /*!< in: index */ + ulint width) /*!< in: print this many entries from start + and end */ +{ + mtr_t mtr; + buf_block_t* root; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + fputs("--------------------------\n" + "INDEX TREE PRINT\n", stderr); + + mtr_start(&mtr); + + root = btr_root_block_get(index, RW_SX_LATCH, &mtr); + + btr_print_recursive(index, root, width, &heap, &offsets, &mtr); + if (heap) { + mem_heap_free(heap); + } + + mtr_commit(&mtr); + + ut_ad(btr_validate_index(index, 0)); +} +#endif /* UNIV_BTR_PRINT */ + +#ifdef UNIV_DEBUG +/************************************************************//** +Checks that the node pointer to a page is appropriate. 
+@return TRUE */ +ibool +btr_check_node_ptr( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + mtr_t* mtr) /*!< in: mtr */ +{ + mem_heap_t* heap; + dtuple_t* tuple; + rec_offs* offsets; + btr_cur_t cursor; + page_t* page = buf_block_get_frame(block); + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + + if (dict_index_get_page(index) == block->page.id().page_no()) { + + return(TRUE); + } + + heap = mem_heap_create(256); + + if (dict_index_is_spatial(index)) { + offsets = rtr_page_get_father_block(NULL, heap, index, block, mtr, + NULL, &cursor); + } else { + offsets = btr_page_get_father_block(NULL, heap, index, block, mtr, + &cursor); + } + + if (page_is_leaf(page)) { + + goto func_exit; + } + + tuple = dict_index_build_node_ptr( + index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap, + btr_page_get_level(page)); + + /* For spatial index, the MBR in the parent rec could be different + with that of first rec of child, their relationship should be + "WITHIN" relationship */ + if (dict_index_is_spatial(index)) { + ut_a(!cmp_dtuple_rec_with_gis( + tuple, btr_cur_get_rec(&cursor), + PAGE_CUR_WITHIN)); + } else { + ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), offsets)); + } +func_exit: + mem_heap_free(heap); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/************************************************************//** +Display identification information for a record. */ +static +void +btr_index_rec_validate_report( +/*==========================*/ + const page_t* page, /*!< in: index page */ + const rec_t* rec, /*!< in: index record */ + const dict_index_t* index) /*!< in: index */ +{ + ib::info() << "Record in index " << index->name + << " of table " << index->table->name + << ", page " << page_id_t(page_get_space_id(page), + page_get_page_no(page)) + << ", at offset " << page_offset(rec); +} + +/************************************************************//** +Checks the size and number of fields in a record based on the definition of +the index. 
+@return TRUE if ok */ +ibool +btr_index_rec_validate( +/*===================*/ + const rec_t* rec, /*!< in: index record */ + const dict_index_t* index, /*!< in: index */ + ibool dump_on_error) /*!< in: TRUE if the function + should print hex dump of record + and page on error */ +{ + ulint len; + const page_t* page; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + page = page_align(rec); + + ut_ad(index->n_core_fields); + + if (index->is_ibuf()) { + /* The insert buffer index tree can contain records from any + other index: we cannot check the number of fields or + their length */ + + return(TRUE); + } + +#ifdef VIRTUAL_INDEX_DEBUG + if (dict_index_has_virtual(index)) { + fprintf(stderr, "index name is %s\n", index->name()); + } +#endif + if ((ibool)!!page_is_comp(page) != dict_table_is_comp(index->table)) { + btr_index_rec_validate_report(page, rec, index); + + ib::error() << "Compact flag=" << !!page_is_comp(page) + << ", should be " << dict_table_is_comp(index->table); + + return(FALSE); + } + + const bool is_alter_metadata = page_is_leaf(page) + && !page_has_prev(page) + && index->is_primary() && index->table->instant + && rec == page_rec_get_next_const(page_get_infimum_rec(page)); + + if (is_alter_metadata + && !rec_is_alter_metadata(rec, page_is_comp(page))) { + btr_index_rec_validate_report(page, rec, index); + + ib::error() << "First record is not ALTER TABLE metadata"; + return FALSE; + } + + if (!page_is_comp(page)) { + const ulint n_rec_fields = rec_get_n_fields_old(rec); + if (n_rec_fields == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD + && index->id == DICT_INDEXES_ID) { + /* A record for older SYS_INDEXES table + (missing merge_threshold column) is acceptable. */ + } else if (is_alter_metadata) { + if (n_rec_fields != ulint(index->n_fields) + 1) { + goto n_field_mismatch; + } + } else if (n_rec_fields < index->n_core_fields + || n_rec_fields > index->n_fields) { +n_field_mismatch: + btr_index_rec_validate_report(page, rec, index); + + ib::error() << "Has " << rec_get_n_fields_old(rec) + << " fields, should have " + << index->n_core_fields << ".." + << index->n_fields; + + if (dump_on_error) { + fputs("InnoDB: corrupt record ", stderr); + rec_print_old(stderr, rec); + putc('\n', stderr); + } + return(FALSE); + } + } + + offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + const dict_field_t* field = index->fields; + ut_ad(rec_offs_n_fields(offsets) + == ulint(index->n_fields) + is_alter_metadata); + + for (unsigned i = 0; i < rec_offs_n_fields(offsets); i++) { + rec_get_nth_field_offs(offsets, i, &len); + + ulint fixed_size; + + if (is_alter_metadata && i == index->first_user_field()) { + fixed_size = FIELD_REF_SIZE; + if (len != FIELD_REF_SIZE + || !rec_offs_nth_extern(offsets, i)) { + goto len_mismatch; + } + + continue; + } else { + fixed_size = dict_col_get_fixed_size( + field->col, page_is_comp(page)); + if (rec_offs_nth_extern(offsets, i)) { + const byte* data = rec_get_nth_field( + rec, offsets, i, &len); + len -= BTR_EXTERN_FIELD_REF_SIZE; + ulint extern_len = mach_read_from_4( + data + len + BTR_EXTERN_LEN + 4); + if (fixed_size == extern_len) { + goto next_field; + } + } + } + + /* Note that if fixed_size != 0, it equals the + length of a fixed-size column in the clustered index. + We should adjust it here. + A prefix index of the column is of fixed, but different + length. 
When fixed_size == 0, prefix_len is the maximum + length of the prefix index column. */ + + if (len_is_stored(len) + && (field->prefix_len + ? len > field->prefix_len + : (fixed_size && len != fixed_size))) { +len_mismatch: + btr_index_rec_validate_report(page, rec, index); + ib::error error; + + error << "Field " << i << " len is " << len + << ", should be " << fixed_size; + + if (dump_on_error) { + error << "; "; + rec_print(error.m_oss, rec, + rec_get_info_bits( + rec, rec_offs_comp(offsets)), + offsets); + } + if (heap) { + mem_heap_free(heap); + } + return(FALSE); + } +next_field: + field++; + } + +#ifdef VIRTUAL_INDEX_DEBUG + if (dict_index_has_virtual(index)) { + rec_print_new(stderr, rec, offsets); + } +#endif + + if (heap) { + mem_heap_free(heap); + } + return(TRUE); +} + +/************************************************************//** +Checks the size and number of fields in records based on the definition of +the index. +@return TRUE if ok */ +static +ibool +btr_index_page_validate( +/*====================*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index) /*!< in: index */ +{ + page_cur_t cur; + ibool ret = TRUE; +#ifndef DBUG_OFF + ulint nth = 1; +#endif /* !DBUG_OFF */ + + page_cur_set_before_first(block, &cur); + + /* Directory slot 0 should only contain the infimum record. */ + DBUG_EXECUTE_IF("check_table_rec_next", + ut_a(page_rec_get_nth_const( + page_cur_get_page(&cur), 0) + == cur.rec); + ut_a(page_dir_slot_get_n_owned( + page_dir_get_nth_slot( + page_cur_get_page(&cur), 0)) + == 1);); + + page_cur_move_to_next(&cur); + + for (;;) { + if (page_cur_is_after_last(&cur)) { + + break; + } + + if (!btr_index_rec_validate(cur.rec, index, TRUE)) { + + return(FALSE); + } + + /* Verify that page_rec_get_nth_const() is correctly + retrieving each record. */ + DBUG_EXECUTE_IF("check_table_rec_next", + ut_a(cur.rec == page_rec_get_nth_const( + page_cur_get_page(&cur), + page_rec_get_n_recs_before( + cur.rec))); + ut_a(nth++ == page_rec_get_n_recs_before( + cur.rec));); + + page_cur_move_to_next(&cur); + } + + return(ret); +} + +/************************************************************//** +Report an error on one page of an index tree. */ +static +void +btr_validate_report1( +/*=================*/ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: B-tree level */ + const buf_block_t* block) /*!< in: index page */ +{ + ib::error error; + error << "In page " << block->page.id().page_no() + << " of index " << index->name + << " of table " << index->table->name; + + if (level > 0) { + error << ", index tree level " << level; + } +} + +/************************************************************//** +Report an error on two pages of an index tree. */ +static +void +btr_validate_report2( +/*=================*/ + const dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: B-tree level */ + const buf_block_t* block1, /*!< in: first index page */ + const buf_block_t* block2) /*!< in: second index page */ +{ + ib::error error; + error << "In pages " << block1->page.id() + << " and " << block2->page.id() << " of index " << index->name + << " of table " << index->table->name; + + if (level) + error << ", index tree level " << level; +} + +/************************************************************//** +Validates index tree level. 
+@return TRUE if ok */ +static +bool +btr_validate_level( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + const trx_t* trx, /*!< in: transaction or NULL */ + ulint level, /*!< in: level number */ + bool lockout)/*!< in: true if X-latch index is intended */ +{ + buf_block_t* block; + page_t* page; + buf_block_t* right_block = 0; /* remove warning */ + page_t* right_page = 0; /* remove warning */ + page_t* father_page; + btr_cur_t node_cur; + btr_cur_t right_node_cur; + rec_t* rec; + page_cur_t cursor; + dtuple_t* node_ptr_tuple; + bool ret = true; + mtr_t mtr; + mem_heap_t* heap = mem_heap_create(256); + rec_offs* offsets = NULL; + rec_offs* offsets2= NULL; +#ifdef UNIV_ZIP_DEBUG + page_zip_des_t* page_zip; +#endif /* UNIV_ZIP_DEBUG */ + ulint savepoint = 0; + ulint savepoint2 = 0; + uint32_t parent_page_no = FIL_NULL; + uint32_t parent_right_page_no = FIL_NULL; + bool rightmost_child = false; + + mtr.start(); + + if (!srv_read_only_mode) { + if (lockout) { + mtr_x_lock_index(index, &mtr); + } else { + mtr_sx_lock_index(index, &mtr); + } + } + + block = btr_root_block_get(index, RW_SX_LATCH, &mtr); + page = buf_block_get_frame(block); + + fil_space_t* space = index->table->space; + + while (level != btr_page_get_level(page)) { + const rec_t* node_ptr; + + if (fseg_page_is_free(space, block->page.id().page_no())) { + + btr_validate_report1(index, level, block); + + ib::warn() << "Page is free"; + + ret = false; + } + + ut_a(index->table->space_id == block->page.id().space()); + ut_a(block->page.id().space() == page_get_space_id(page)); +#ifdef UNIV_ZIP_DEBUG + page_zip = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + ut_a(!page_is_leaf(page)); + + page_cur_set_before_first(block, &cursor); + page_cur_move_to_next(&cursor); + + node_ptr = page_cur_get_rec(&cursor); + offsets = rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + savepoint2 = mtr_set_savepoint(&mtr); + block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr); + page = buf_block_get_frame(block); + + /* For R-Tree, since record order might not be the same as + linked index page in the lower level, we need to travers + backwards to get the first page rec in this level. + This is only used for index validation. Spatial index + does not use such scan for any of its DML or query + operations */ + if (dict_index_is_spatial(index)) { + uint32_t left_page_no = btr_page_get_prev(page); + + while (left_page_no != FIL_NULL) { + /* To obey latch order of tree blocks, + we should release the right_block once to + obtain lock of the uncle block. */ + mtr_release_block_at_savepoint( + &mtr, savepoint2, block); + + savepoint2 = mtr_set_savepoint(&mtr); + block = btr_block_get(*index, left_page_no, + RW_SX_LATCH, false, + &mtr); + page = buf_block_get_frame(block); + left_page_no = btr_page_get_prev(page); + } + } + } + + /* Now we are on the desired level. Loop through the pages on that + level. 
*/ + +loop: + mem_heap_empty(heap); + offsets = offsets2 = NULL; + if (!srv_read_only_mode) { + if (lockout) { + mtr_x_lock_index(index, &mtr); + } else { + mtr_sx_lock_index(index, &mtr); + } + } + +#ifdef UNIV_ZIP_DEBUG + page_zip = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + ut_a(block->page.id().space() == index->table->space_id); + + if (fseg_page_is_free(space, block->page.id().page_no())) { + + btr_validate_report1(index, level, block); + + ib::warn() << "Page is marked as free"; + ret = false; + + } else if (btr_page_get_index_id(page) != index->id) { + + ib::error() << "Page index id " << btr_page_get_index_id(page) + << " != data dictionary index id " << index->id; + + ret = false; + + } else if (!page_validate(page, index)) { + + btr_validate_report1(index, level, block); + ret = false; + + } else if (level == 0 && !btr_index_page_validate(block, index)) { + + /* We are on level 0. Check that the records have the right + number of fields, and field lengths are right. */ + + ret = false; + } + + ut_a(btr_page_get_level(page) == level); + + uint32_t right_page_no = btr_page_get_next(page); + uint32_t left_page_no = btr_page_get_prev(page); + + ut_a(!page_is_empty(page) + || (level == 0 + && page_get_page_no(page) == dict_index_get_page(index))); + + if (right_page_no != FIL_NULL) { + const rec_t* right_rec; + savepoint = mtr_set_savepoint(&mtr); + + right_block = btr_block_get(*index, right_page_no, RW_SX_LATCH, + !level, &mtr); + right_page = buf_block_get_frame(right_block); + + if (btr_page_get_prev(right_page) != page_get_page_no(page)) { + btr_validate_report2(index, level, block, right_block); + fputs("InnoDB: broken FIL_PAGE_NEXT" + " or FIL_PAGE_PREV links\n", stderr); + + ret = false; + } + + if (page_is_comp(right_page) != page_is_comp(page)) { + btr_validate_report2(index, level, block, right_block); + fputs("InnoDB: 'compact' flag mismatch\n", stderr); + + ret = false; + + goto node_ptr_fails; + } + + rec = page_rec_get_prev(page_get_supremum_rec(page)); + right_rec = page_rec_get_next(page_get_infimum_rec( + right_page)); + offsets = rec_get_offsets(rec, index, offsets, + page_is_leaf(page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + offsets2 = rec_get_offsets(right_rec, index, offsets2, + page_is_leaf(right_page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + + /* For spatial index, we cannot guarantee the key ordering + across pages, so skip the record compare verification for + now. 
Will enhanced in special R-Tree index validation scheme */ + if (!dict_index_is_spatial(index) + && cmp_rec_rec(rec, right_rec, + offsets, offsets2, index) >= 0) { + + btr_validate_report2(index, level, block, right_block); + + fputs("InnoDB: records in wrong order" + " on adjacent pages\n", stderr); + + fputs("InnoDB: record ", stderr); + rec = page_rec_get_prev(page_get_supremum_rec(page)); + rec_print(stderr, rec, index); + putc('\n', stderr); + fputs("InnoDB: record ", stderr); + rec = page_rec_get_next( + page_get_infimum_rec(right_page)); + rec_print(stderr, rec, index); + putc('\n', stderr); + + ret = false; + } + } + + if (level > 0 && left_page_no == FIL_NULL) { + ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + page_rec_get_next(page_get_infimum_rec(page)), + page_is_comp(page))); + } + + /* Similarly skip the father node check for spatial index for now, + for a couple of reasons: + 1) As mentioned, there is no ordering relationship between records + in parent level and linked pages in the child level. + 2) Search parent from root is very costly for R-tree. + We will add special validation mechanism for R-tree later (WL #7520) */ + if (!dict_index_is_spatial(index) + && block->page.id().page_no() != dict_index_get_page(index)) { + + /* Check father node pointers */ + rec_t* node_ptr; + + btr_cur_position( + index, page_rec_get_next(page_get_infimum_rec(page)), + block, &node_cur); + offsets = btr_page_get_father_node_ptr_for_validate( + offsets, heap, &node_cur, &mtr); + + father_page = btr_cur_get_page(&node_cur); + node_ptr = btr_cur_get_rec(&node_cur); + + parent_page_no = page_get_page_no(father_page); + parent_right_page_no = btr_page_get_next(father_page); + rightmost_child = page_rec_is_supremum( + page_rec_get_next(node_ptr)); + + btr_cur_position( + index, + page_rec_get_prev(page_get_supremum_rec(page)), + block, &node_cur); + + offsets = btr_page_get_father_node_ptr_for_validate( + offsets, heap, &node_cur, &mtr); + + if (node_ptr != btr_cur_get_rec(&node_cur) + || btr_node_ptr_get_child_page_no(node_ptr, offsets) + != block->page.id().page_no()) { + + btr_validate_report1(index, level, block); + + fputs("InnoDB: node pointer to the page is wrong\n", + stderr); + + fputs("InnoDB: node ptr ", stderr); + rec_print(stderr, node_ptr, index); + + rec = btr_cur_get_rec(&node_cur); + fprintf(stderr, "\n" + "InnoDB: node ptr child page n:o %u\n", + btr_node_ptr_get_child_page_no(rec, offsets)); + + fputs("InnoDB: record on page ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + ret = false; + + goto node_ptr_fails; + } + + if (!page_is_leaf(page)) { + node_ptr_tuple = dict_index_build_node_ptr( + index, + page_rec_get_next(page_get_infimum_rec(page)), + 0, heap, btr_page_get_level(page)); + + if (cmp_dtuple_rec(node_ptr_tuple, node_ptr, + offsets)) { + const rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); + + btr_validate_report1(index, level, block); + + ib::error() << "Node ptrs differ on levels > 0"; + + fputs("InnoDB: node ptr ",stderr); + rec_print_new(stderr, node_ptr, offsets); + fputs("InnoDB: first rec ", stderr); + rec_print(stderr, first_rec, index); + putc('\n', stderr); + ret = false; + + goto node_ptr_fails; + } + } + + if (left_page_no == FIL_NULL) { + ut_a(node_ptr == page_rec_get_next( + page_get_infimum_rec(father_page))); + ut_a(!page_has_prev(father_page)); + } + + if (right_page_no == FIL_NULL) { + ut_a(node_ptr == page_rec_get_prev( + page_get_supremum_rec(father_page))); + ut_a(!page_has_next(father_page)); + 
} else { + const rec_t* right_node_ptr; + + right_node_ptr = page_rec_get_next(node_ptr); + + if (!lockout && rightmost_child) { + + /* To obey latch order of tree blocks, + we should release the right_block once to + obtain lock of the uncle block. */ + mtr_release_block_at_savepoint( + &mtr, savepoint, right_block); + + if (parent_right_page_no != FIL_NULL) { + btr_block_get(*index, + parent_right_page_no, + RW_SX_LATCH, false, + &mtr); + } + + right_block = btr_block_get(*index, + right_page_no, + RW_SX_LATCH, + !level, &mtr); + } + + btr_cur_position( + index, page_rec_get_next( + page_get_infimum_rec( + buf_block_get_frame( + right_block))), + right_block, &right_node_cur); + + offsets = btr_page_get_father_node_ptr_for_validate( + offsets, heap, &right_node_cur, &mtr); + + if (right_node_ptr + != page_get_supremum_rec(father_page)) { + + if (btr_cur_get_rec(&right_node_cur) + != right_node_ptr) { + ret = false; + fputs("InnoDB: node pointer to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + } + } else { + page_t* right_father_page + = btr_cur_get_page(&right_node_cur); + + if (btr_cur_get_rec(&right_node_cur) + != page_rec_get_next( + page_get_infimum_rec( + right_father_page))) { + ret = false; + fputs("InnoDB: node pointer 2 to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + } + + if (page_get_page_no(right_father_page) + != btr_page_get_next(father_page)) { + + ret = false; + fputs("InnoDB: node pointer 3 to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + } + } + } + } + +node_ptr_fails: + /* Commit the mini-transaction to release the latch on 'page'. + Re-acquire the latch on right_page, which will become 'page' + on the next loop. The page has already been checked. */ + mtr.commit(); + + if (trx_is_interrupted(trx)) { + /* On interrupt, return the current status. */ + } else if (right_page_no != FIL_NULL) { + + mtr.start(); + + if (!lockout) { + if (rightmost_child) { + if (parent_right_page_no != FIL_NULL) { + btr_block_get(*index, + parent_right_page_no, + RW_SX_LATCH, false, + &mtr); + } + } else if (parent_page_no != FIL_NULL) { + btr_block_get(*index, parent_page_no, + RW_SX_LATCH, false, &mtr); + } + } + + block = btr_block_get(*index, right_page_no, RW_SX_LATCH, + !level, &mtr); + page = buf_block_get_frame(block); + + goto loop; + } + + mem_heap_free(heap); + + return(ret); +} + +/**************************************************************//** +Checks the consistency of an index tree. 
+@return DB_SUCCESS if ok, error code if not */ +dberr_t +btr_validate_index( +/*===============*/ + dict_index_t* index, /*!< in: index */ + const trx_t* trx) /*!< in: transaction or NULL */ +{ + dberr_t err = DB_SUCCESS; + bool lockout = dict_index_is_spatial(index); + + /* Full Text index are implemented by auxiliary tables, + not the B-tree */ + if (dict_index_is_online_ddl(index) || (index->type & DICT_FTS)) { + return(err); + } + + mtr_t mtr; + + mtr_start(&mtr); + + if (!srv_read_only_mode) { + if (lockout) { + mtr_x_lock_index(index, &mtr); + } else { + mtr_sx_lock_index(index, &mtr); + } + } + + page_t* root = btr_root_get(index, &mtr); + + if (!root) { + mtr_commit(&mtr); + return DB_CORRUPTION; + } + + ulint n = btr_page_get_level(root); + + btr_validate_index_running++; + for (ulint i = 0; i <= n; ++i) { + + if (!btr_validate_level(index, trx, n - i, lockout)) { + err = DB_CORRUPTION; + } + } + + mtr_commit(&mtr); + /* In theory we need release barrier here, so that + btr_validate_index_running decrement is guaranteed to + happen after latches are released. + + Original code issued SEQ_CST on update and non-atomic + access on load. Which means it had broken synchronisation + as well. */ + btr_validate_index_running--; + + return(err); +} + +/**************************************************************//** +Checks if the page in the cursor can be merged with given page. +If necessary, re-organize the merge_page. +@return true if possible to merge. */ +static +bool +btr_can_merge_with_page( +/*====================*/ + btr_cur_t* cursor, /*!< in: cursor on the page to merge */ + uint32_t page_no, /*!< in: a sibling page */ + buf_block_t** merge_block, /*!< out: the merge block */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + dict_index_t* index; + page_t* page; + ulint n_recs; + ulint data_size; + ulint max_ins_size_reorg; + ulint max_ins_size; + buf_block_t* mblock; + page_t* mpage; + DBUG_ENTER("btr_can_merge_with_page"); + + if (page_no == FIL_NULL) { + *merge_block = NULL; + DBUG_RETURN(false); + } + + index = btr_cur_get_index(cursor); + page = btr_cur_get_page(cursor); + + mblock = btr_block_get(*index, page_no, RW_X_LATCH, page_is_leaf(page), + mtr); + mpage = buf_block_get_frame(mblock); + + n_recs = page_get_n_recs(page); + data_size = page_get_data_size(page); + + max_ins_size_reorg = page_get_max_insert_size_after_reorganize( + mpage, n_recs); + + if (data_size > max_ins_size_reorg) { + goto error; + } + + /* If compression padding tells us that merging will result in + too packed up page i.e.: which is likely to cause compression + failure then don't merge the pages. 
*/ + if (mblock->page.zip.data && page_is_leaf(mpage) + && (page_get_data_size(mpage) + data_size + >= dict_index_zip_pad_optimal_page_size(index))) { + + goto error; + } + + max_ins_size = page_get_max_insert_size(mpage, n_recs); + + if (data_size > max_ins_size) { + /* We have to reorganize mpage */ + if (!btr_page_reorganize_block(page_zip_level, mblock, index, + mtr)) { + goto error; + } + + max_ins_size = page_get_max_insert_size(mpage, n_recs); + + ut_ad(page_validate(mpage, index)); + ut_ad(max_ins_size == max_ins_size_reorg); + + if (data_size > max_ins_size) { + + /* Add fault tolerance, though this should + never happen */ + + goto error; + } + } + + *merge_block = mblock; + DBUG_RETURN(true); + +error: + *merge_block = NULL; + DBUG_RETURN(false); +} diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc new file mode 100644 index 00000000..9004064a --- /dev/null +++ b/storage/innobase/btr/btr0bulk.cc @@ -0,0 +1,1238 @@ +/***************************************************************************** + +Copyright (c) 2014, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0bulk.cc +The B-tree bulk load + +Created 03/11/2014 Shaohua Wang +*******************************************************/ + +#include "btr0bulk.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "btr0pcur.h" +#include "ibuf0ibuf.h" +#include "page0page.h" +#include "trx0trx.h" + +/** Innodb B-tree index fill factor for bulk load. */ +uint innobase_fill_factor; + +/** Initialize members, allocate page if needed and start mtr. +Note: we commit all mtrs on failure. +@return error code. */ +dberr_t +PageBulk::init() +{ + buf_block_t* new_block; + page_t* new_page; + + ut_ad(m_heap == NULL); + m_heap = mem_heap_create(1000); + + m_mtr.start(); + m_index->set_modified(m_mtr); + + if (m_page_no == FIL_NULL) { + mtr_t alloc_mtr; + + /* We commit redo log for allocation by a separate mtr, + because we don't guarantee pages are committed following + the allocation order, and we will always generate redo log + for page allocation, even when creating a new tablespace. */ + alloc_mtr.start(); + m_index->set_modified(alloc_mtr); + + uint32_t n_reserved; + if (!fsp_reserve_free_extents(&n_reserved, + m_index->table->space, + 1, FSP_NORMAL, &alloc_mtr)) { + alloc_mtr.commit(); + m_mtr.commit(); + return(DB_OUT_OF_FILE_SPACE); + } + + /* Allocate a new page. 
*/ + new_block = btr_page_alloc(m_index, 0, FSP_UP, m_level, + &alloc_mtr, &m_mtr); + + m_index->table->space->release_free_extents(n_reserved); + + alloc_mtr.commit(); + + new_page = buf_block_get_frame(new_block); + m_page_no = new_block->page.id().page_no(); + + byte* index_id = my_assume_aligned<2> + (PAGE_HEADER + PAGE_INDEX_ID + new_page); + compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4); + compile_time_assert(FIL_NULL == 0xffffffff); + memset_aligned<8>(new_page + FIL_PAGE_PREV, 0xff, 8); + + if (UNIV_LIKELY_NULL(new_block->page.zip.data)) { + mach_write_to_8(index_id, m_index->id); + page_create_zip(new_block, m_index, m_level, 0, + &m_mtr); + } else { + ut_ad(!m_index->is_spatial()); + page_create(new_block, &m_mtr, + m_index->table->not_redundant()); + m_mtr.memset(*new_block, FIL_PAGE_PREV, 8, 0xff); + m_mtr.write<2,mtr_t::MAYBE_NOP>(*new_block, PAGE_HEADER + + PAGE_LEVEL + + new_page, m_level); + m_mtr.write<8>(*new_block, index_id, m_index->id); + } + } else { + new_block = btr_block_get(*m_index, m_page_no, RW_X_LATCH, + false, &m_mtr); + + new_page = buf_block_get_frame(new_block); + ut_ad(new_block->page.id().page_no() == m_page_no); + + ut_ad(page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW); + + btr_page_set_level(new_block, m_level, &m_mtr); + } + + m_page_zip = buf_block_get_page_zip(new_block); + + if (!m_level && dict_index_is_sec_or_ibuf(m_index)) { + page_update_max_trx_id(new_block, m_page_zip, m_trx_id, + &m_mtr); + } + + m_block = new_block; + m_page = new_page; + m_cur_rec = page_get_infimum_rec(new_page); + ut_ad(m_is_comp == !!page_is_comp(new_page)); + m_free_space = page_get_free_space_of_empty(m_is_comp); + + if (innobase_fill_factor == 100 && dict_index_is_clust(m_index)) { + /* Keep default behavior compatible with 5.6 */ + m_reserved_space = dict_index_get_space_reserve(); + } else { + m_reserved_space = + srv_page_size * (100 - innobase_fill_factor) / 100; + } + + m_padding_space = + srv_page_size - dict_index_zip_pad_optimal_page_size(m_index); + m_heap_top = page_header_get_ptr(new_page, PAGE_HEAP_TOP); + m_rec_no = page_header_get_field(new_page, PAGE_N_RECS); + /* Temporarily reset PAGE_DIRECTION_B from PAGE_NO_DIRECTION to 0, + without writing redo log, to ensure that needs_finish() will hold + on an empty page. */ + ut_ad(m_page[PAGE_HEADER + PAGE_DIRECTION_B] == PAGE_NO_DIRECTION); + m_page[PAGE_HEADER + PAGE_DIRECTION_B] = 0; + ut_d(m_total_data = 0); + + return(DB_SUCCESS); +} + +/** Insert a record in the page. +@tparam fmt the page format +@param[in,out] rec record +@param[in] offsets record offsets */ +template<PageBulk::format fmt> +inline void PageBulk::insertPage(rec_t *rec, rec_offs *offsets) +{ + ut_ad((m_page_zip != nullptr) == (fmt == COMPRESSED)); + ut_ad((fmt != REDUNDANT) == m_is_comp); + ut_ad(page_align(m_heap_top) == m_page); + ut_ad(m_heap); + + const ulint rec_size= rec_offs_size(offsets); + const ulint extra_size= rec_offs_extra_size(offsets); + ut_ad(page_align(m_heap_top + rec_size) == m_page); + ut_d(const bool is_leaf= page_rec_is_leaf(m_cur_rec)); + +#ifdef UNIV_DEBUG + /* Check whether records are in order. */ + if (page_offset(m_cur_rec) != + (fmt == REDUNDANT ? PAGE_OLD_INFIMUM : PAGE_NEW_INFIMUM)) + { + const rec_t *old_rec = m_cur_rec; + rec_offs *old_offsets= rec_get_offsets(old_rec, m_index, nullptr, is_leaf + ? 
m_index->n_core_fields : 0, + ULINT_UNDEFINED, &m_heap); + ut_ad(cmp_rec_rec(rec, old_rec, offsets, old_offsets, m_index) > 0); + } + + m_total_data+= rec_size; +#endif /* UNIV_DEBUG */ + + rec_t* const insert_rec= m_heap_top + extra_size; + + /* Insert the record in the linked list. */ + if (fmt != REDUNDANT) + { + const rec_t *next_rec= m_page + + page_offset(m_cur_rec + mach_read_from_2(m_cur_rec - REC_NEXT)); + if (fmt != COMPRESSED) + m_mtr.write<2>(*m_block, m_cur_rec - REC_NEXT, + static_cast<uint16_t>(insert_rec - m_cur_rec)); + else + { + mach_write_to_2(m_cur_rec - REC_NEXT, + static_cast<uint16_t>(insert_rec - m_cur_rec)); + memcpy(m_heap_top, rec - extra_size, rec_size); + } + + rec_t * const this_rec= fmt != COMPRESSED + ? const_cast<rec_t*>(rec) : insert_rec; + rec_set_bit_field_1(this_rec, 0, REC_NEW_N_OWNED, REC_N_OWNED_MASK, + REC_N_OWNED_SHIFT); + rec_set_bit_field_2(this_rec, PAGE_HEAP_NO_USER_LOW + m_rec_no, + REC_NEW_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); + mach_write_to_2(this_rec - REC_NEXT, + static_cast<uint16_t>(next_rec - insert_rec)); + } + else + { + memcpy(const_cast<rec_t*>(rec) - REC_NEXT, m_cur_rec - REC_NEXT, 2); + m_mtr.write<2>(*m_block, m_cur_rec - REC_NEXT, page_offset(insert_rec)); + rec_set_bit_field_1(const_cast<rec_t*>(rec), 0, + REC_OLD_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + rec_set_bit_field_2(const_cast<rec_t*>(rec), + PAGE_HEAP_NO_USER_LOW + m_rec_no, + REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); + } + + if (fmt == COMPRESSED) + /* We already wrote the record. Log is written in PageBulk::compress(). */; + else if (page_offset(m_cur_rec) == + (fmt == REDUNDANT ? PAGE_OLD_INFIMUM : PAGE_NEW_INFIMUM)) + m_mtr.memcpy(*m_block, m_heap_top, rec - extra_size, rec_size); + else + { + /* Try to copy common prefix from the preceding record. */ + const byte *r= rec - extra_size; + const byte * const insert_rec_end= m_heap_top + rec_size; + byte *b= m_heap_top; + + /* Skip any unchanged prefix of the record. */ + for (; * b == *r; b++, r++); + + ut_ad(b < insert_rec_end); + + const byte *c= m_cur_rec - (rec - r); + const byte * const c_end= std::min(m_cur_rec + rec_offs_data_size(offsets), + m_heap_top); + + /* Try to copy any bytes of the preceding record. */ + if (UNIV_LIKELY(c >= m_page && c < c_end)) + { + const byte *cm= c; + byte *bm= b; + const byte *rm= r; + for (; cm < c_end && *rm == *cm; cm++, bm++, rm++); + ut_ad(bm <= insert_rec_end); + size_t len= static_cast<size_t>(rm - r); + ut_ad(!memcmp(r, c, len)); + if (len > 2) + { + memcpy(b, c, len); + m_mtr.memmove(*m_block, page_offset(b), page_offset(c), len); + c= cm; + b= bm; + r= rm; + } + } + + if (c < m_cur_rec) + { + if (!rec_offs_data_size(offsets)) + { +no_data: + m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, m_cur_rec - c); + goto rec_done; + } + /* Some header bytes differ. Compare the data separately. */ + const byte *cd= m_cur_rec; + byte *bd= insert_rec; + const byte *rd= rec; + /* Skip any unchanged prefix of the record. */ + for (;; cd++, bd++, rd++) + if (bd == insert_rec_end) + goto no_data; + else if (*bd != *rd) + break; + + /* Try to copy any data bytes of the preceding record. 
*/ + if (c_end - cd > 2) + { + const byte *cdm= cd; + const byte *rdm= rd; + for (; cdm < c_end && *rdm == *cdm; cdm++, rdm++) + ut_ad(rdm - rd + bd <= insert_rec_end); + size_t len= static_cast<size_t>(rdm - rd); + ut_ad(!memcmp(rd, cd, len)); + if (len > 2) + { + m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, m_cur_rec - c); + memcpy(bd, cd, len); + m_mtr.memmove(*m_block, page_offset(bd), page_offset(cd), len); + c= cdm; + b= rdm - rd + bd; + r= rdm; + } + } + } + + if (size_t len= static_cast<size_t>(insert_rec_end - b)) + m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, len); + } + +rec_done: + ut_ad(fmt == COMPRESSED || !memcmp(m_heap_top, rec - extra_size, rec_size)); + rec_offs_make_valid(insert_rec, m_index, is_leaf, offsets); + + /* Update the member variables. */ + ulint slot_size= page_dir_calc_reserved_space(m_rec_no + 1) - + page_dir_calc_reserved_space(m_rec_no); + + ut_ad(m_free_space >= rec_size + slot_size); + ut_ad(m_heap_top + rec_size < m_page + srv_page_size); + + m_free_space-= rec_size + slot_size; + m_heap_top+= rec_size; + m_rec_no++; + m_cur_rec= insert_rec; +} + +/** Insert a record in the page. +@param[in] rec record +@param[in] offsets record offsets */ +inline void PageBulk::insert(const rec_t *rec, rec_offs *offsets) +{ + byte rec_hdr[REC_N_OLD_EXTRA_BYTES]; + static_assert(REC_N_OLD_EXTRA_BYTES > REC_N_NEW_EXTRA_BYTES, "file format"); + + if (UNIV_LIKELY_NULL(m_page_zip)) + insertPage<COMPRESSED>(const_cast<rec_t*>(rec), offsets); + else if (m_is_comp) + { + memcpy(rec_hdr, rec - REC_N_NEW_EXTRA_BYTES, REC_N_NEW_EXTRA_BYTES); + insertPage<DYNAMIC>(const_cast<rec_t*>(rec), offsets); + memcpy(const_cast<rec_t*>(rec) - REC_N_NEW_EXTRA_BYTES, rec_hdr, + REC_N_NEW_EXTRA_BYTES); + } + else + { + memcpy(rec_hdr, rec - REC_N_OLD_EXTRA_BYTES, REC_N_OLD_EXTRA_BYTES); + insertPage<REDUNDANT>(const_cast<rec_t*>(rec), offsets); + memcpy(const_cast<rec_t*>(rec) - REC_N_OLD_EXTRA_BYTES, rec_hdr, + REC_N_OLD_EXTRA_BYTES); + } +} + +/** Set the number of owned records in the uncompressed page of +a ROW_FORMAT=COMPRESSED record without redo-logging. */ +static void rec_set_n_owned_zip(rec_t *rec, ulint n_owned) +{ + rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); +} + +/** Mark end of insertion to the page. Scan all records to set page dirs, +and set page header members. +@tparam fmt page format */ +template<PageBulk::format fmt> +inline void PageBulk::finishPage() +{ + ut_ad((m_page_zip != nullptr) == (fmt == COMPRESSED)); + ut_ad((fmt != REDUNDANT) == m_is_comp); + + ulint count= 0; + ulint n_recs= 0; + byte *slot= my_assume_aligned<2>(m_page + srv_page_size - + (PAGE_DIR + PAGE_DIR_SLOT_SIZE)); + const page_dir_slot_t *const slot0 = slot; + compile_time_assert(PAGE_DIR_SLOT_SIZE == 2); + if (fmt != REDUNDANT) + { + uint16_t offset= mach_read_from_2(PAGE_NEW_INFIMUM - REC_NEXT + m_page); + ut_ad(offset >= PAGE_NEW_SUPREMUM - PAGE_NEW_INFIMUM); + offset= static_cast<uint16_t>(offset + PAGE_NEW_INFIMUM); + /* Set owner & dir. 
*/ + while (offset != PAGE_NEW_SUPREMUM) + { + ut_ad(offset >= PAGE_NEW_SUPREMUM); + ut_ad(offset < page_offset(slot)); + count++; + n_recs++; + + if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2) + { + slot-= PAGE_DIR_SLOT_SIZE; + mach_write_to_2(slot, offset); + + if (fmt != COMPRESSED) + page_rec_set_n_owned<false>(m_block, m_page + offset, count, true, + &m_mtr); + else + rec_set_n_owned_zip(m_page + offset, count); + + count= 0; + } + + uint16_t next= static_cast<uint16_t> + ((mach_read_from_2(m_page + offset - REC_NEXT) + offset) & + (srv_page_size - 1)); + ut_ad(next); + offset= next; + } + + if (slot0 != slot && (count + 1 + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 <= + PAGE_DIR_SLOT_MAX_N_OWNED)) + { + /* Merge the last two slots, like page_cur_insert_rec_low() does. */ + count+= (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2; + + rec_t *rec= const_cast<rec_t*>(page_dir_slot_get_rec(slot)); + if (fmt != COMPRESSED) + page_rec_set_n_owned<false>(m_block, rec, 0, true, &m_mtr); + else + rec_set_n_owned_zip(rec, 0); + } + else + slot-= PAGE_DIR_SLOT_SIZE; + + mach_write_to_2(slot, PAGE_NEW_SUPREMUM); + if (fmt != COMPRESSED) + page_rec_set_n_owned<false>(m_block, m_page + PAGE_NEW_SUPREMUM, + count + 1, true, &m_mtr); + else + rec_set_n_owned_zip(m_page + PAGE_NEW_SUPREMUM, count + 1); + } + else + { + rec_t *insert_rec= m_page + + mach_read_from_2(PAGE_OLD_INFIMUM - REC_NEXT + m_page); + + /* Set owner & dir. */ + while (insert_rec != m_page + PAGE_OLD_SUPREMUM) + { + count++; + n_recs++; + + if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2) + { + slot-= PAGE_DIR_SLOT_SIZE; + mach_write_to_2(slot, page_offset(insert_rec)); + page_rec_set_n_owned<false>(m_block, insert_rec, count, false, &m_mtr); + count= 0; + } + + insert_rec= m_page + mach_read_from_2(insert_rec - REC_NEXT); + } + + if (slot0 != slot && (count + 1 + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 <= + PAGE_DIR_SLOT_MAX_N_OWNED)) + { + /* Merge the last two slots, like page_cur_insert_rec_low() does. */ + count+= (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2; + + rec_t *rec= const_cast<rec_t*>(page_dir_slot_get_rec(slot)); + page_rec_set_n_owned<false>(m_block, rec, 0, false, &m_mtr); + } + else + slot-= PAGE_DIR_SLOT_SIZE; + + mach_write_to_2(slot, PAGE_OLD_SUPREMUM); + page_rec_set_n_owned<false>(m_block, m_page + PAGE_OLD_SUPREMUM, count + 1, + false, &m_mtr); + } + + if (!m_rec_no); + else if (fmt != COMPRESSED) + { + static_assert(PAGE_N_DIR_SLOTS == 0, "compatibility"); + alignas(8) byte page_header[PAGE_N_HEAP + 2]; + mach_write_to_2(page_header + PAGE_N_DIR_SLOTS, + 1 + (slot0 - slot) / PAGE_DIR_SLOT_SIZE); + mach_write_to_2(page_header + PAGE_HEAP_TOP, m_heap_top - m_page); + mach_write_to_2(page_header + PAGE_N_HEAP, + (PAGE_HEAP_NO_USER_LOW + m_rec_no) | + uint16_t{fmt != REDUNDANT} << 15); + m_mtr.memcpy(*m_block, PAGE_HEADER + m_page, page_header, + sizeof page_header); + m_mtr.write<2>(*m_block, PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no); + m_mtr.memcpy(*m_block, page_offset(slot), slot0 - slot); + } + else + { + /* For ROW_FORMAT=COMPRESSED, redo log may be written in + PageBulk::compress(). 
*/ + mach_write_to_2(PAGE_HEADER + PAGE_N_DIR_SLOTS + m_page, + 1 + (slot0 - slot) / PAGE_DIR_SLOT_SIZE); + mach_write_to_2(PAGE_HEADER + PAGE_HEAP_TOP + m_page, + static_cast<ulint>(m_heap_top - m_page)); + mach_write_to_2(PAGE_HEADER + PAGE_N_HEAP + m_page, + (PAGE_HEAP_NO_USER_LOW + m_rec_no) | 1U << 15); + mach_write_to_2(PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no); + } +} + +inline bool PageBulk::needs_finish() const +{ + ut_ad(page_align(m_cur_rec) == m_block->frame); + ut_ad(m_page == m_block->frame); + if (!m_page[PAGE_HEADER + PAGE_DIRECTION_B]) + return true; + ulint heap_no, n_heap= page_header_get_field(m_page, PAGE_N_HEAP); + ut_ad((n_heap & 0x7fff) >= PAGE_HEAP_NO_USER_LOW); + if (n_heap & 0x8000) + { + n_heap&= 0x7fff; + heap_no= rec_get_heap_no_new(m_cur_rec); + if (heap_no == PAGE_HEAP_NO_INFIMUM && + page_header_get_field(m_page, PAGE_HEAP_TOP) == PAGE_NEW_SUPREMUM_END) + return false; + } + else + { + heap_no= rec_get_heap_no_old(m_cur_rec); + if (heap_no == PAGE_HEAP_NO_INFIMUM && + page_header_get_field(m_page, PAGE_HEAP_TOP) == PAGE_OLD_SUPREMUM_END) + return false; + } + return heap_no != n_heap - 1; +} + +/** Mark end of insertion to the page. Scan all records to set page dirs, +and set page header members. +@tparam compressed whether the page is in ROW_FORMAT=COMPRESSED */ +inline void PageBulk::finish() +{ + ut_ad(!m_index->is_spatial()); + + if (!needs_finish()); + else if (UNIV_LIKELY_NULL(m_page_zip)) + finishPage<COMPRESSED>(); + else if (m_is_comp) + finishPage<DYNAMIC>(); + else + finishPage<REDUNDANT>(); + + /* In MariaDB 10.2, 10.3, 10.4, we would initialize + PAGE_DIRECTION_B, PAGE_N_DIRECTION, PAGE_LAST_INSERT + in the same way as we would during normal INSERT operations. + Starting with MariaDB Server 10.5, bulk insert will not + touch those fields. */ + ut_ad(!m_page[PAGE_HEADER + PAGE_INSTANT]); + /* Restore the temporary change of PageBulk::init() that was necessary to + ensure that PageBulk::needs_finish() holds on an empty page. */ + m_page[PAGE_HEADER + PAGE_DIRECTION_B]= PAGE_NO_DIRECTION; + + ut_ad(!page_header_get_field(m_page, PAGE_FREE)); + ut_ad(!page_header_get_field(m_page, PAGE_GARBAGE)); + ut_ad(!page_header_get_field(m_page, PAGE_LAST_INSERT)); + ut_ad(!page_header_get_field(m_page, PAGE_N_DIRECTION)); + ut_ad(m_total_data + page_dir_calc_reserved_space(m_rec_no) <= + page_get_free_space_of_empty(m_is_comp)); + ut_ad(!needs_finish()); + ut_ad(page_validate(m_page, m_index)); +} + +/** Commit inserts done to the page +@param[in] success Flag whether all inserts succeed. */ +void PageBulk::commit(bool success) +{ + finish(); + if (success && !dict_index_is_clust(m_index) && page_is_leaf(m_page)) + ibuf_set_bitmap_for_bulk_load(m_block, innobase_fill_factor == 100); + m_mtr.commit(); +} + +/** Compress a page of compressed table +@return true compress successfully or no need to compress +@return false compress failed. 
*/ +bool +PageBulk::compress() +{ + ut_ad(m_page_zip != NULL); + + return page_zip_compress(m_block, m_index, page_zip_level, &m_mtr); +} + +/** Get node pointer +@return node pointer */ +dtuple_t* +PageBulk::getNodePtr() +{ + rec_t* first_rec; + dtuple_t* node_ptr; + + /* Create node pointer */ + first_rec = page_rec_get_next(page_get_infimum_rec(m_page)); + ut_a(page_rec_is_user_rec(first_rec)); + node_ptr = dict_index_build_node_ptr(m_index, first_rec, m_page_no, + m_heap, m_level); + + return(node_ptr); +} + +/** Get split rec in left page.We split a page in half when compresssion fails, +and the split rec will be copied to right page. +@return split rec */ +rec_t* +PageBulk::getSplitRec() +{ + rec_t* rec; + rec_offs* offsets; + ulint total_used_size; + ulint total_recs_size; + ulint n_recs; + + ut_ad(m_page_zip != NULL); + ut_ad(m_rec_no >= 2); + ut_ad(!m_index->is_instant()); + + ut_ad(page_get_free_space_of_empty(m_is_comp) > m_free_space); + total_used_size = page_get_free_space_of_empty(m_is_comp) + - m_free_space; + + total_recs_size = 0; + n_recs = 0; + offsets = NULL; + rec = page_get_infimum_rec(m_page); + const ulint n_core = page_is_leaf(m_page) ? m_index->n_core_fields : 0; + + do { + rec = page_rec_get_next(rec); + ut_ad(page_rec_is_user_rec(rec)); + + offsets = rec_get_offsets(rec, m_index, offsets, n_core, + ULINT_UNDEFINED, &m_heap); + total_recs_size += rec_offs_size(offsets); + n_recs++; + } while (total_recs_size + page_dir_calc_reserved_space(n_recs) + < total_used_size / 2); + + /* Keep at least one record on left page */ + if (page_rec_is_infimum(page_rec_get_prev(rec))) { + rec = page_rec_get_next(rec); + ut_ad(page_rec_is_user_rec(rec)); + } + + return(rec); +} + +/** Copy all records after split rec including itself. +@param[in] rec split rec */ +void +PageBulk::copyIn( + rec_t* split_rec) +{ + + rec_t* rec = split_rec; + rec_offs* offsets = NULL; + + ut_ad(m_rec_no == 0); + ut_ad(page_rec_is_user_rec(rec)); + + const ulint n_core = page_rec_is_leaf(rec) + ? m_index->n_core_fields : 0; + + do { + offsets = rec_get_offsets(rec, m_index, offsets, n_core, + ULINT_UNDEFINED, &m_heap); + + insert(rec, offsets); + + rec = page_rec_get_next(rec); + } while (!page_rec_is_supremum(rec)); + + ut_ad(m_rec_no > 0); +} + +/** Remove all records after split rec including itself. +@param[in] rec split rec */ +void +PageBulk::copyOut( + rec_t* split_rec) +{ + rec_t* rec; + rec_t* last_rec; + ulint n; + + /* Suppose before copyOut, we have 5 records on the page: + infimum->r1->r2->r3->r4->r5->supremum, and r3 is the split rec. + + after copyOut, we have 2 records on the page: + infimum->r1->r2->supremum. slot ajustment is not done. */ + + rec = page_rec_get_next(page_get_infimum_rec(m_page)); + last_rec = page_rec_get_prev(page_get_supremum_rec(m_page)); + n = 0; + + while (rec != split_rec) { + rec = page_rec_get_next(rec); + n++; + } + + ut_ad(n > 0); + + /* Set last record's next in page */ + rec_offs* offsets = NULL; + rec = page_rec_get_prev(split_rec); + const ulint n_core = page_rec_is_leaf(split_rec) + ? m_index->n_core_fields : 0; + + offsets = rec_get_offsets(rec, m_index, offsets, n_core, + ULINT_UNDEFINED, &m_heap); + mach_write_to_2(rec - REC_NEXT, m_is_comp + ? 
static_cast<uint16_t> + (PAGE_NEW_SUPREMUM - page_offset(rec)) + : PAGE_OLD_SUPREMUM); + + /* Set related members */ + m_cur_rec = rec; + m_heap_top = rec_get_end(rec, offsets); + + offsets = rec_get_offsets(last_rec, m_index, offsets, n_core, + ULINT_UNDEFINED, &m_heap); + + m_free_space += ulint(rec_get_end(last_rec, offsets) - m_heap_top) + + page_dir_calc_reserved_space(m_rec_no) + - page_dir_calc_reserved_space(n); + ut_ad(lint(m_free_space) > 0); + m_rec_no = n; + +#ifdef UNIV_DEBUG + m_total_data -= ulint(rec_get_end(last_rec, offsets) - m_heap_top); +#endif /* UNIV_DEBUG */ +} + +/** Set next page +@param[in] next_page_no next page no */ +inline void PageBulk::setNext(ulint next_page_no) +{ + if (UNIV_LIKELY_NULL(m_page_zip)) + /* For ROW_FORMAT=COMPRESSED, redo log may be written + in PageBulk::compress(). */ + mach_write_to_4(m_page + FIL_PAGE_NEXT, next_page_no); + else + m_mtr.write<4>(*m_block, m_page + FIL_PAGE_NEXT, next_page_no); +} + +/** Set previous page +@param[in] prev_page_no previous page no */ +inline void PageBulk::setPrev(ulint prev_page_no) +{ + if (UNIV_LIKELY_NULL(m_page_zip)) + /* For ROW_FORMAT=COMPRESSED, redo log may be written + in PageBulk::compress(). */ + mach_write_to_4(m_page + FIL_PAGE_PREV, prev_page_no); + else + m_mtr.write<4>(*m_block, m_page + FIL_PAGE_PREV, prev_page_no); +} + +/** Check if required space is available in the page for the rec to be inserted. +We check fill factor & padding here. +@param[in] length required length +@return true if space is available */ +bool +PageBulk::isSpaceAvailable( + ulint rec_size) +{ + ulint slot_size; + ulint required_space; + + slot_size = page_dir_calc_reserved_space(m_rec_no + 1) + - page_dir_calc_reserved_space(m_rec_no); + + required_space = rec_size + slot_size; + + if (required_space > m_free_space) { + ut_ad(m_rec_no > 0); + return false; + } + + /* Fillfactor & Padding apply to both leaf and non-leaf pages. + Note: we keep at least 2 records in a page to avoid B-tree level + growing too high. */ + if (m_rec_no >= 2 + && ((m_page_zip == NULL && m_free_space - required_space + < m_reserved_space) + || (m_page_zip != NULL && m_free_space - required_space + < m_padding_space))) { + return(false); + } + + return(true); +} + +/** Check whether the record needs to be stored externally. +@return false if the entire record can be stored locally on the page */ +bool +PageBulk::needExt( + const dtuple_t* tuple, + ulint rec_size) +{ + return page_zip_rec_needs_ext(rec_size, m_is_comp, + dtuple_get_n_fields(tuple), + m_block->zip_size()); +} + +/** Store external record +Since the record is not logged yet, so we don't log update to the record. +the blob data is logged first, then the record is logged in bulk mode. +@param[in] big_rec external recrod +@param[in] offsets record offsets +@return error code */ +dberr_t +PageBulk::storeExt( + const big_rec_t* big_rec, + rec_offs* offsets) +{ + finish(); + + /* Note: not all fields are initialized in btr_pcur. */ + btr_pcur_t btr_pcur; + btr_pcur.pos_state = BTR_PCUR_IS_POSITIONED; + btr_pcur.latch_mode = BTR_MODIFY_LEAF; + btr_pcur.btr_cur.index = m_index; + btr_pcur.btr_cur.page_cur.index = m_index; + btr_pcur.btr_cur.page_cur.rec = m_cur_rec; + btr_pcur.btr_cur.page_cur.offsets = offsets; + btr_pcur.btr_cur.page_cur.block = m_block; + + dberr_t err = btr_store_big_rec_extern_fields( + &btr_pcur, offsets, big_rec, &m_mtr, BTR_STORE_INSERT_BULK); + + /* Reset m_block and m_cur_rec from page cursor, because + block may be changed during blob insert. 
(FIXME: Can it really?) */ + ut_ad(m_block == btr_pcur.btr_cur.page_cur.block); + + m_block = btr_pcur.btr_cur.page_cur.block; + m_cur_rec = btr_pcur.btr_cur.page_cur.rec; + m_page = buf_block_get_frame(m_block); + + return(err); +} + +/** Release block by commiting mtr +Note: log_free_check requires holding no lock/latch in current thread. */ +void +PageBulk::release() +{ + finish(); + + /* We fix the block because we will re-pin it soon. */ + buf_block_buf_fix_inc(m_block, __FILE__, __LINE__); + + /* No other threads can modify this block. */ + m_modify_clock = buf_block_get_modify_clock(m_block); + + m_mtr.commit(); +} + +/** Start mtr and latch the block */ +dberr_t +PageBulk::latch() +{ + m_mtr.start(); + m_index->set_modified(m_mtr); + + ut_ad(m_block->page.buf_fix_count()); + + /* In case the block is S-latched by page_cleaner. */ + if (!buf_page_optimistic_get(RW_X_LATCH, m_block, m_modify_clock, + __FILE__, __LINE__, &m_mtr)) { + m_block = buf_page_get_gen(page_id_t(m_index->table->space_id, + m_page_no), + 0, RW_X_LATCH, + m_block, BUF_GET_IF_IN_POOL, + __FILE__, __LINE__, &m_mtr, &m_err); + + if (m_err != DB_SUCCESS) { + return (m_err); + } + + ut_ad(m_block != NULL); + } + + buf_block_buf_fix_dec(m_block); + + ut_ad(m_block->page.buf_fix_count()); + + ut_ad(m_cur_rec > m_page && m_cur_rec < m_heap_top); + + return (m_err); +} + +/** Split a page +@param[in] page_bulk page to split +@param[in] next_page_bulk next page +@return error code */ +dberr_t +BtrBulk::pageSplit( + PageBulk* page_bulk, + PageBulk* next_page_bulk) +{ + ut_ad(page_bulk->getPageZip() != NULL); + + if (page_bulk->getRecNo() <= 1) { + return(DB_TOO_BIG_RECORD); + } + + /* Initialize a new page */ + PageBulk new_page_bulk(m_index, m_trx->id, FIL_NULL, + page_bulk->getLevel()); + dberr_t err = new_page_bulk.init(); + if (err != DB_SUCCESS) { + return(err); + } + + /* Copy the upper half to the new page. */ + rec_t* split_rec = page_bulk->getSplitRec(); + new_page_bulk.copyIn(split_rec); + page_bulk->copyOut(split_rec); + + /* Commit the pages after split. */ + err = pageCommit(page_bulk, &new_page_bulk, true); + if (err != DB_SUCCESS) { + pageAbort(&new_page_bulk); + return(err); + } + + err = pageCommit(&new_page_bulk, next_page_bulk, true); + if (err != DB_SUCCESS) { + pageAbort(&new_page_bulk); + return(err); + } + + return(err); +} + +/** Commit(finish) a page. We set next/prev page no, compress a page of +compressed table and split the page if compression fails, insert a node +pointer to father page if needed, and commit mini-transaction. +@param[in] page_bulk page to commit +@param[in] next_page_bulk next page +@param[in] insert_father false when page_bulk is a root page and + true when it's a non-root page +@return error code */ +dberr_t +BtrBulk::pageCommit( + PageBulk* page_bulk, + PageBulk* next_page_bulk, + bool insert_father) +{ + page_bulk->finish(); + + /* Set page links */ + if (next_page_bulk != NULL) { + ut_ad(page_bulk->getLevel() == next_page_bulk->getLevel()); + + page_bulk->setNext(next_page_bulk->getPageNo()); + next_page_bulk->setPrev(page_bulk->getPageNo()); + } else { + ut_ad(!page_has_next(page_bulk->getPage())); + /* If a page is released and latched again, we need to + mark it modified in mini-transaction. */ + page_bulk->set_modified(); + } + + ut_ad(!rw_lock_own_flagged(&m_index->lock, + RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX + | RW_LOCK_FLAG_S)); + + /* Compress page if it's a compressed table. 
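+ If compression fails, the page is split in half via pageSplit()
+ and each half is compressed and committed separately.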
*/ + if (page_bulk->getPageZip() != NULL && !page_bulk->compress()) { + return(pageSplit(page_bulk, next_page_bulk)); + } + + /* Insert node pointer to father page. */ + if (insert_father) { + dtuple_t* node_ptr = page_bulk->getNodePtr(); + dberr_t err = insert(node_ptr, page_bulk->getLevel()+1); + + if (err != DB_SUCCESS) { + return(err); + } + } + + /* Commit mtr. */ + page_bulk->commit(true); + + return(DB_SUCCESS); +} + +/** Log free check */ +inline void BtrBulk::logFreeCheck() +{ + if (log_sys.check_flush_or_checkpoint()) { + release(); + + log_check_margins(); + + latch(); + } +} + +/** Release all latches */ +void +BtrBulk::release() +{ + ut_ad(m_root_level + 1 == m_page_bulks.size()); + + for (ulint level = 0; level <= m_root_level; level++) { + PageBulk* page_bulk = m_page_bulks.at(level); + + page_bulk->release(); + } +} + +/** Re-latch all latches */ +void +BtrBulk::latch() +{ + ut_ad(m_root_level + 1 == m_page_bulks.size()); + + for (ulint level = 0; level <= m_root_level; level++) { + PageBulk* page_bulk = m_page_bulks.at(level); + page_bulk->latch(); + } +} + +/** Insert a tuple to page in a level +@param[in] tuple tuple to insert +@param[in] level B-tree level +@return error code */ +dberr_t +BtrBulk::insert( + dtuple_t* tuple, + ulint level) +{ + bool is_left_most = false; + dberr_t err = DB_SUCCESS; + + /* Check if we need to create a PageBulk for the level. */ + if (level + 1 > m_page_bulks.size()) { + PageBulk* new_page_bulk + = UT_NEW_NOKEY(PageBulk(m_index, m_trx->id, FIL_NULL, + level)); + err = new_page_bulk->init(); + if (err != DB_SUCCESS) { + UT_DELETE(new_page_bulk); + return(err); + } + + m_page_bulks.push_back(new_page_bulk); + ut_ad(level + 1 == m_page_bulks.size()); + m_root_level = level; + + is_left_most = true; + } + + ut_ad(m_page_bulks.size() > level); + + PageBulk* page_bulk = m_page_bulks.at(level); + + if (is_left_most && level > 0 && page_bulk->getRecNo() == 0) { + /* The node pointer must be marked as the predefined minimum + record, as there is no lower alphabetical limit to records in + the leftmost node of a level: */ + dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple) + | REC_INFO_MIN_REC_FLAG); + } + + ulint n_ext = 0; + ulint rec_size = rec_get_converted_size(m_index, tuple, n_ext); + big_rec_t* big_rec = NULL; + rec_t* rec = NULL; + rec_offs* offsets = NULL; + + if (page_bulk->needExt(tuple, rec_size)) { + /* The record is so big that we have to store some fields + externally on separate database pages */ + big_rec = dtuple_convert_big_rec(m_index, 0, tuple, &n_ext); + + if (big_rec == NULL) { + return(DB_TOO_BIG_RECORD); + } + + rec_size = rec_get_converted_size(m_index, tuple, n_ext); + } + + if (page_bulk->getPageZip() != NULL + && page_zip_is_too_big(m_index, tuple)) { + err = DB_TOO_BIG_RECORD; + goto func_exit; + } + + if (!page_bulk->isSpaceAvailable(rec_size)) { + /* Create a sibling page_bulk. */ + PageBulk* sibling_page_bulk; + sibling_page_bulk = UT_NEW_NOKEY(PageBulk(m_index, m_trx->id, + FIL_NULL, level)); + err = sibling_page_bulk->init(); + if (err != DB_SUCCESS) { + UT_DELETE(sibling_page_bulk); + goto func_exit; + } + + /* Commit page bulk. */ + err = pageCommit(page_bulk, sibling_page_bulk, true); + if (err != DB_SUCCESS) { + pageAbort(sibling_page_bulk); + UT_DELETE(sibling_page_bulk); + goto func_exit; + } + + /* Set new page bulk to page_bulks. 
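+ The committed page_bulk for this level is freed below and replaced
+ by the freshly created sibling_page_bulk.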
*/ + ut_ad(sibling_page_bulk->getLevel() <= m_root_level); + m_page_bulks.at(level) = sibling_page_bulk; + + UT_DELETE(page_bulk); + page_bulk = sibling_page_bulk; + + /* Important: log_free_check whether we need a checkpoint. */ + if (page_is_leaf(sibling_page_bulk->getPage())) { + if (trx_is_interrupted(m_trx)) { + err = DB_INTERRUPTED; + goto func_exit; + } + + srv_inc_activity_count(); + logFreeCheck(); + } + } + + /* Convert tuple to rec. */ + rec = rec_convert_dtuple_to_rec(static_cast<byte*>(mem_heap_alloc( + page_bulk->m_heap, rec_size)), m_index, tuple, n_ext); + offsets = rec_get_offsets(rec, m_index, offsets, level + ? 0 : m_index->n_core_fields, + ULINT_UNDEFINED, &page_bulk->m_heap); + + page_bulk->insert(rec, offsets); + + if (big_rec != NULL) { + ut_ad(dict_index_is_clust(m_index)); + ut_ad(page_bulk->getLevel() == 0); + ut_ad(page_bulk == m_page_bulks.at(0)); + + /* Release all pages above the leaf level */ + for (ulint level = 1; level <= m_root_level; level++) { + m_page_bulks.at(level)->release(); + } + + err = page_bulk->storeExt(big_rec, offsets); + + /* Latch */ + for (ulint level = 1; level <= m_root_level; level++) { + PageBulk* page_bulk = m_page_bulks.at(level); + page_bulk->latch(); + } + } + +func_exit: + if (big_rec != NULL) { + dtuple_convert_back_big_rec(m_index, tuple, big_rec); + } + + return(err); +} + +/** Btree bulk load finish. We commit the last page in each level +and copy the last page in top level to the root page of the index +if no error occurs. +@param[in] err whether bulk load was successful until now +@return error code */ +dberr_t +BtrBulk::finish(dberr_t err) +{ + uint32_t last_page_no = FIL_NULL; + + ut_ad(!m_index->table->is_temporary()); + + if (m_page_bulks.size() == 0) { + /* The table is empty. The root page of the index tree + is already in a consistent state. No need to flush. */ + return(err); + } + + ut_ad(m_root_level + 1 == m_page_bulks.size()); + + /* Finish all page bulks */ + for (ulint level = 0; level <= m_root_level; level++) { + PageBulk* page_bulk = m_page_bulks.at(level); + + last_page_no = page_bulk->getPageNo(); + + if (err == DB_SUCCESS) { + err = pageCommit(page_bulk, NULL, + level != m_root_level); + } + + if (err != DB_SUCCESS) { + pageAbort(page_bulk); + } + + UT_DELETE(page_bulk); + } + + if (err == DB_SUCCESS) { + rec_t* first_rec; + mtr_t mtr; + buf_block_t* last_block; + PageBulk root_page_bulk(m_index, m_trx->id, + m_index->page, m_root_level); + + mtr.start(); + m_index->set_modified(mtr); + mtr_x_lock_index(m_index, &mtr); + + ut_ad(last_page_no != FIL_NULL); + last_block = btr_block_get(*m_index, last_page_no, RW_X_LATCH, + false, &mtr); + first_rec = page_rec_get_next( + page_get_infimum_rec(last_block->frame)); + ut_ad(page_rec_is_user_rec(first_rec)); + + /* Copy last page to root page. */ + err = root_page_bulk.init(); + if (err != DB_SUCCESS) { + mtr.commit(); + return(err); + } + root_page_bulk.copyIn(first_rec); + root_page_bulk.finish(); + + /* Remove last page. 
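+ Its contents were copied to the root page above, so the block
+ itself can be freed from the index tree.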
*/ + btr_page_free(m_index, last_block, &mtr); + + mtr.commit(); + + err = pageCommit(&root_page_bulk, NULL, false); + ut_ad(err == DB_SUCCESS); + } + + ut_ad(!sync_check_iterate(dict_sync_check())); + + ut_ad(err != DB_SUCCESS + || btr_validate_index(m_index, NULL) == DB_SUCCESS); + return(err); +} diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc new file mode 100644 index 00000000..5bb2a0e2 --- /dev/null +++ b/storage/innobase/btr/btr0cur.cc @@ -0,0 +1,8279 @@ +/***************************************************************************** + +Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2015, 2021, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0cur.cc +The index tree cursor + +All changes that row operations make to a B-tree or the records +there must go through this module! Undo log records are written here +of every modify or insert of a clustered index record. + + NOTE!!! +To make sure we do not run out of disk space during a pessimistic +insert or update, we have to reserve 2 x the height of the index tree +many pages in the tablespace before we start the operation, because +if leaf splitting has been started, it is difficult to undo, except +by crashing the database and doing a roll-forward. + +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0cur.h" +#include "row0upd.h" +#include "mtr0log.h" +#include "page0page.h" +#include "page0zip.h" +#include "rem0rec.h" +#include "rem0cmp.h" +#include "buf0lru.h" +#include "btr0btr.h" +#include "btr0sea.h" +#include "row0log.h" +#include "row0purge.h" +#include "row0upd.h" +#include "trx0rec.h" +#include "trx0roll.h" +#include "que0que.h" +#include "row0row.h" +#include "srv0srv.h" +#include "ibuf0ibuf.h" +#include "lock0lock.h" +#include "zlib.h" +#include "srv0start.h" +#include "mysql_com.h" +#include "dict0stats.h" +#ifdef WITH_WSREP +#include "mysql/service_wsrep.h" +#endif /* WITH_WSREP */ + +/** Buffered B-tree operation types, introduced as part of delete buffering. 
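+The btr_op_t value is derived from the BTR_INSERT, BTR_DELETE and
+BTR_DELETE_MARK flags that callers OR into the latch mode of
+btr_cur_search_to_nth_level().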
 */
+enum btr_op_t {
+ BTR_NO_OP = 0, /*!< Not buffered */
+ BTR_INSERT_OP, /*!< Insert, do not ignore UNIQUE */
+ BTR_INSERT_IGNORE_UNIQUE_OP, /*!< Insert, ignoring UNIQUE */
+ BTR_DELETE_OP, /*!< Purge a delete-marked record */
+ BTR_DELMARK_OP /*!< Mark a record for deletion */
+};
+
+/** Modification types for the B-tree operation.
+ Note that the order must be DELETE, BOTH, INSERT !!
+ */
+enum btr_intention_t {
+ BTR_INTENTION_DELETE,
+ BTR_INTENTION_BOTH,
+ BTR_INTENTION_INSERT
+};
+
+/** With the index->lock scalability improvement, the only clear performance
+regression that was observed was caused by a history list that had grown huge.
+That is because the previous exclusive use of index->lock also worked as a way
+of reserving free blocks and read IO bandwidth with priority for purge. To
+keep the history list from growing as huge as before, pessimistic tree
+operations performed by purge are prioritized in the same way as in the
+previous implementation whenever the list appears to be growing too large.
+
+ Experimentally, the history list length starts to clearly affect performance
+throughput from about 100000. */
+#define BTR_CUR_FINE_HISTORY_LENGTH 100000
+
+/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
+Atomic_counter<ulint> btr_cur_n_non_sea;
+/** Old value of btr_cur_n_non_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+ulint btr_cur_n_non_sea_old;
+#ifdef BTR_CUR_HASH_ADAPT
+/** Number of successful adaptive hash index lookups in
+btr_cur_search_to_nth_level(). */
+ulint btr_cur_n_sea;
+/** Old value of btr_cur_n_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+ulint btr_cur_n_sea_old;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#ifdef UNIV_DEBUG
+/* Flag to limit optimistic insert records */
+uint btr_cur_limit_optimistic_insert_debug;
+#endif /* UNIV_DEBUG */
+
+/** In the optimistic insert, if the insert does not fit but at least this
+much space can be released by reorganizing the page, then the page is
+reorganized */
+#define BTR_CUR_PAGE_REORGANIZE_LIMIT (srv_page_size / 32)
+
+/** The structure of a BLOB part header */
+/* @{ */
+/*--------------------------------------*/
+#define BTR_BLOB_HDR_PART_LEN 0 /*!< BLOB part len on this
+ page */
+#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /*!< next BLOB part page no,
+ FIL_NULL if none */
+/*--------------------------------------*/
+#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB
+ part header, in bytes */
+
+/** Estimated table level stats from sampled value.
+@param value sampled stats
+@param index index being sampled
+@param sample number of sampled rows
+@param ext_size externally stored data size
+@param not_empty table not empty
+@return estimated table wide stats from sampled value */
+#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
+ (((value) * static_cast<ib_uint64_t>(index->stat_n_leaf_pages) \
+ + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
+
+/* @} */
+
+/*******************************************************************//**
+Marks all extern fields in a record as owned by the record. This function
+should be called if the delete mark of a record is removed: a record that is
+not delete-marked always owns all its extern fields.
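+Ownership is tracked by a flag bit in each externally stored field
+reference.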
*/ +static +void +btr_cur_unmark_extern_fields( +/*=========================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr); /*!< in: mtr, or NULL if not logged */ +/*******************************************************************//** +Adds path information to the cursor for the current page, for which +the binary search has been performed. */ +static +void +btr_cur_add_path_info( +/*==================*/ + btr_cur_t* cursor, /*!< in: cursor positioned on a page */ + ulint height, /*!< in: height of the page in tree; + 0 means leaf node */ + ulint root_height); /*!< in: root node height in tree */ +/***********************************************************//** +Frees the externally stored fields for a record, if the field is mentioned +in the update vector. */ +static +void +btr_rec_free_updated_extern_fields( +/*===============================*/ + dict_index_t* index, /*!< in: index of rec; the index tree MUST be + X-latched */ + rec_t* rec, /*!< in: record */ + buf_block_t* block, /*!< in: index page of rec */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update, /*!< in: update vector */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr); /*!< in: mini-transaction handle which contains + an X-latch to record page and to the tree */ +/***********************************************************//** +Frees the externally stored fields for a record. */ +static +void +btr_rec_free_externally_stored_fields( +/*==================================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched */ + rec_t* rec, /*!< in: record */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + buf_block_t* block, /*!< in: index page of rec */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr); /*!< in: mini-transaction handle which contains + an X-latch to record page and to the index + tree */ + +/*==================== B-TREE SEARCH =========================*/ + +/** Latches the leaf page or pages requested. +@param[in] block leaf page where the search converged +@param[in] latch_mode BTR_SEARCH_LEAF, ... +@param[in] cursor cursor +@param[in] mtr mini-transaction +@return blocks and savepoints which actually latched. */ +btr_latch_leaves_t +btr_cur_latch_leaves( + buf_block_t* block, + ulint latch_mode, + btr_cur_t* cursor, + mtr_t* mtr) +{ + rw_lock_type_t mode; + uint32_t left_page_no; + uint32_t right_page_no; + buf_block_t* get_block; + bool spatial; + btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}}; + + compile_time_assert(int(MTR_MEMO_PAGE_S_FIX) == int(RW_S_LATCH)); + compile_time_assert(int(MTR_MEMO_PAGE_X_FIX) == int(RW_X_LATCH)); + compile_time_assert(int(MTR_MEMO_PAGE_SX_FIX) == int(RW_SX_LATCH)); + ut_ad(block->page.id().space() == cursor->index->table->space->id); + + spatial = dict_index_is_spatial(cursor->index) && cursor->rtr_info; + ut_ad(block->page.in_file()); + + switch (latch_mode) { + case BTR_SEARCH_LEAF: + case BTR_MODIFY_LEAF: + case BTR_SEARCH_TREE: + if (spatial) { + cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS] + = mtr_set_savepoint(mtr); + } + + mode = latch_mode == BTR_MODIFY_LEAF ? 
RW_X_LATCH : RW_S_LATCH; + latch_leaves.savepoints[1] = mtr_set_savepoint(mtr); + get_block = btr_block_get(*cursor->index, + block->page.id().page_no(), mode, + true, mtr); + latch_leaves.blocks[1] = get_block; +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) + == page_is_comp(block->frame)); +#endif /* UNIV_BTR_DEBUG */ + if (spatial) { + cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS] + = get_block; + } + + return(latch_leaves); + case BTR_MODIFY_TREE: + /* It is exclusive for other operations which calls + btr_page_set_prev() */ + ut_ad(mtr->memo_contains_flagged(&cursor->index->lock, + MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + /* x-latch also siblings from left to right */ + left_page_no = btr_page_get_prev(block->frame); + + if (left_page_no != FIL_NULL) { + + if (spatial) { + cursor->rtr_info->tree_savepoints[ + RTR_MAX_LEVELS] = mtr_set_savepoint(mtr); + } + + latch_leaves.savepoints[0] = mtr_set_savepoint(mtr); + get_block = btr_block_get( + *cursor->index, left_page_no, RW_X_LATCH, + true, mtr); + latch_leaves.blocks[0] = get_block; + + if (spatial) { + cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS] + = get_block; + } + } + + if (spatial) { + cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS + 1] + = mtr_set_savepoint(mtr); + } + + latch_leaves.savepoints[1] = mtr_set_savepoint(mtr); + get_block = btr_block_get( + *cursor->index, block->page.id().page_no(), + RW_X_LATCH, true, mtr); + latch_leaves.blocks[1] = get_block; + +#ifdef UNIV_BTR_DEBUG + /* Sanity check only after both the blocks are latched. */ + if (latch_leaves.blocks[0] != NULL) { + ut_a(page_is_comp(latch_leaves.blocks[0]->frame) + == page_is_comp(block->frame)); + ut_a(btr_page_get_next(latch_leaves.blocks[0]->frame) + == block->page.id().page_no()); + } + ut_a(page_is_comp(get_block->frame) + == page_is_comp(block->frame)); +#endif /* UNIV_BTR_DEBUG */ + + if (spatial) { + cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS + 1] + = get_block; + } + + right_page_no = btr_page_get_next(block->frame); + + if (right_page_no != FIL_NULL) { + if (spatial) { + cursor->rtr_info->tree_savepoints[ + RTR_MAX_LEVELS + 2] = mtr_set_savepoint( + mtr); + } + latch_leaves.savepoints[2] = mtr_set_savepoint(mtr); + get_block = btr_block_get(*cursor->index, + right_page_no, RW_X_LATCH, + true, mtr); + latch_leaves.blocks[2] = get_block; +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) + == page_is_comp(block->frame)); + ut_a(btr_page_get_prev(get_block->frame) + == block->page.id().page_no()); +#endif /* UNIV_BTR_DEBUG */ + if (spatial) { + cursor->rtr_info->tree_blocks[ + RTR_MAX_LEVELS + 2] = get_block; + } + } + + return(latch_leaves); + + case BTR_SEARCH_PREV: + case BTR_MODIFY_PREV: + mode = latch_mode == BTR_SEARCH_PREV ? 
RW_S_LATCH : RW_X_LATCH; + /* latch also left sibling */ + rw_lock_s_lock(&block->lock); + left_page_no = btr_page_get_prev(block->frame); + rw_lock_s_unlock(&block->lock); + + if (left_page_no != FIL_NULL) { + latch_leaves.savepoints[0] = mtr_set_savepoint(mtr); + get_block = btr_block_get( + *cursor->index, left_page_no, mode, + true, mtr); + latch_leaves.blocks[0] = get_block; + cursor->left_block = get_block; +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) + == page_is_comp(block->frame)); + ut_a(btr_page_get_next(get_block->frame) + == block->page.id().page_no()); +#endif /* UNIV_BTR_DEBUG */ + } + + latch_leaves.savepoints[1] = mtr_set_savepoint(mtr); + get_block = btr_block_get(*cursor->index, + block->page.id().page_no(), mode, + true, mtr); + latch_leaves.blocks[1] = get_block; +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) + == page_is_comp(block->frame)); +#endif /* UNIV_BTR_DEBUG */ + return(latch_leaves); + case BTR_CONT_MODIFY_TREE: + ut_ad(dict_index_is_spatial(cursor->index)); + return(latch_leaves); + } + + ut_error; + return(latch_leaves); +} + +/** Load the instant ALTER TABLE metadata from the clustered index +when loading a table definition. +@param[in,out] index clustered index definition +@param[in,out] mtr mini-transaction +@return error code +@retval DB_SUCCESS if no error occurred +@retval DB_CORRUPTION if any corruption was noticed */ +static dberr_t btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr) +{ + ut_ad(index->is_primary()); + ut_ad(index->n_core_null_bytes == dict_index_t::NO_CORE_NULL_BYTES); + ut_ad(index->table->supports_instant()); + ut_ad(index->table->is_readable()); + + const fil_space_t* space = index->table->space; + if (!space) { +unreadable: + ib::error() << "Table " << index->table->name + << " has an unreadable root page"; + index->table->corrupted = true; + return DB_CORRUPTION; + } + + page_t* root = btr_root_get(index, mtr); + + if (!root || btr_cur_instant_root_init(index, root)) { + goto unreadable; + } + + ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES); + + if (fil_page_get_type(root) == FIL_PAGE_INDEX) { + ut_ad(!index->is_instant()); + return DB_SUCCESS; + } + + btr_cur_t cur; + /* Relax the assertion in rec_init_offsets(). */ + ut_ad(!index->in_instant_init); + ut_d(index->in_instant_init = true); + dberr_t err = btr_cur_open_at_index_side(true, index, BTR_SEARCH_LEAF, + &cur, 0, mtr); + ut_d(index->in_instant_init = false); + if (err != DB_SUCCESS) { + index->table->corrupted = true; + return err; + } + + ut_ad(page_cur_is_before_first(&cur.page_cur)); + ut_ad(page_is_leaf(cur.page_cur.block->frame)); + + page_cur_move_to_next(&cur.page_cur); + + const rec_t* rec = cur.page_cur.rec; + const ulint comp = dict_table_is_comp(index->table); + const ulint info_bits = rec_get_info_bits(rec, comp); + + if (page_rec_is_supremum(rec) + || !(info_bits & REC_INFO_MIN_REC_FLAG)) { + if (!index->is_instant()) { + /* The FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be + assigned even if instant ADD COLUMN was not + committed. Changes to these page header fields are not + undo-logged, but changes to the hidden metadata record + are. If the server is killed and restarted, the page + header fields could remain set even though no metadata + record is present. 
*/ + return DB_SUCCESS; + } + + ib::error() << "Table " << index->table->name + << " is missing instant ALTER metadata"; + index->table->corrupted = true; + return DB_CORRUPTION; + } + + if ((info_bits & ~REC_INFO_DELETED_FLAG) != REC_INFO_MIN_REC_FLAG + || (comp && rec_get_status(rec) != REC_STATUS_INSTANT)) { +incompatible: + ib::error() << "Table " << index->table->name + << " contains unrecognizable instant ALTER metadata"; + index->table->corrupted = true; + return DB_CORRUPTION; + } + + /* Read the metadata. We can get here on server restart + or when the table was evicted from the data dictionary cache + and is now being accessed again. + + Here, READ COMMITTED and REPEATABLE READ should be equivalent. + Committing the ADD COLUMN operation would acquire + MDL_EXCLUSIVE and LOCK_X|LOCK_TABLE, which would prevent any + concurrent operations on the table, including table eviction + from the cache. */ + + if (info_bits & REC_INFO_DELETED_FLAG) { + /* This metadata record includes a BLOB that identifies + any dropped or reordered columns. */ + ulint trx_id_offset = index->trx_id_offset; + /* If !index->trx_id_offset, the PRIMARY KEY contains + variable-length columns. For the metadata record, + variable-length columns should be written with zero + length. However, before MDEV-21088 was fixed, for + variable-length encoded PRIMARY KEY column of type + CHAR, we wrote more than zero bytes. That is why we + must determine the actual length of each PRIMARY KEY + column. The DB_TRX_ID will start right after any + PRIMARY KEY columns. */ + ut_ad(index->n_uniq); + + /* We cannot invoke rec_get_offsets() before + index->table->deserialise_columns(). Therefore, + we must duplicate some logic here. */ + if (trx_id_offset) { + } else if (index->table->not_redundant()) { + /* The PRIMARY KEY contains variable-length columns. + For the metadata record, variable-length columns are + always written with zero length. The DB_TRX_ID will + start right after any fixed-length columns. */ + + /* OK, before MDEV-21088 was fixed, for + variable-length encoded PRIMARY KEY column of + type CHAR, we wrote more than zero bytes. In + order to allow affected tables to be accessed, + it would be nice to determine the actual + length of each PRIMARY KEY column. However, to + be able to do that, we should determine the + size of the null-bit bitmap in the metadata + record. And we cannot know that before reading + the metadata BLOB, whose starting point we are + trying to find here. (Although the PRIMARY KEY + columns cannot be NULL, we would have to know + where the lengths of variable-length PRIMARY KEY + columns start.) + + So, unfortunately we cannot help users who + were affected by MDEV-21088 on a ROW_FORMAT=COMPACT + or ROW_FORMAT=DYNAMIC table. 
*/ + + for (uint i = index->n_uniq; i--; ) { + trx_id_offset += index->fields[i].fixed_len; + } + } else if (rec_get_1byte_offs_flag(rec)) { + trx_id_offset = rec_1_get_field_end_info( + rec, index->n_uniq - 1); + ut_ad(!(trx_id_offset & REC_1BYTE_SQL_NULL_MASK)); + trx_id_offset &= ~REC_1BYTE_SQL_NULL_MASK; + } else { + trx_id_offset = rec_2_get_field_end_info( + rec, index->n_uniq - 1); + ut_ad(!(trx_id_offset & REC_2BYTE_SQL_NULL_MASK)); + trx_id_offset &= ~REC_2BYTE_SQL_NULL_MASK; + } + + const byte* ptr = rec + trx_id_offset + + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + if (mach_read_from_4(ptr + BTR_EXTERN_LEN)) { + goto incompatible; + } + + uint len = mach_read_from_4(ptr + BTR_EXTERN_LEN + 4); + if (!len + || mach_read_from_4(ptr + BTR_EXTERN_OFFSET) + != FIL_PAGE_DATA + || mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID) + != space->id) { + goto incompatible; + } + + buf_block_t* block = buf_page_get( + page_id_t(space->id, + mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)), + 0, RW_S_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE); + if (fil_page_get_type(block->frame) != FIL_PAGE_TYPE_BLOB + || mach_read_from_4(&block->frame[FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO]) + != FIL_NULL + || mach_read_from_4(&block->frame[FIL_PAGE_DATA + + BTR_BLOB_HDR_PART_LEN]) + != len) { + goto incompatible; + } + + /* The unused part of the BLOB page should be zero-filled. */ + for (const byte* b = block->frame + + (FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE) + len, + * const end = block->frame + srv_page_size + - BTR_EXTERN_LEN; + b < end; ) { + if (*b++) { + goto incompatible; + } + } + + if (index->table->deserialise_columns( + &block->frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE], + len)) { + goto incompatible; + } + + /* Proceed to initialize the default values of + any instantly added columns. */ + } + + mem_heap_t* heap = NULL; + rec_offs* offsets = rec_get_offsets(rec, index, NULL, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + if (rec_offs_any_default(offsets)) { +inconsistent: + mem_heap_free(heap); + goto incompatible; + } + + /* In fact, because we only ever append fields to the metadata + record, it is also OK to perform READ UNCOMMITTED and + then ignore any extra fields, provided that + trx_sys.is_registered(DB_TRX_ID). 
*/ + if (rec_offs_n_fields(offsets) + > ulint(index->n_fields) + !!index->table->instant + && !trx_sys.is_registered(current_trx(), + row_get_rec_trx_id(rec, index, + offsets))) { + goto inconsistent; + } + + for (unsigned i = index->n_core_fields; i < index->n_fields; i++) { + dict_col_t* col = index->fields[i].col; + const unsigned o = i + !!index->table->instant; + ulint len; + const byte* data = rec_get_nth_field(rec, offsets, o, &len); + ut_ad(!col->is_added()); + ut_ad(!col->def_val.data); + col->def_val.len = len; + switch (len) { + case UNIV_SQL_NULL: + continue; + case 0: + col->def_val.data = field_ref_zero; + continue; + } + ut_ad(len != UNIV_SQL_DEFAULT); + if (!rec_offs_nth_extern(offsets, o)) { + col->def_val.data = mem_heap_dup( + index->table->heap, data, len); + } else if (len < BTR_EXTERN_FIELD_REF_SIZE + || !memcmp(data + len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)) { + col->def_val.len = UNIV_SQL_DEFAULT; + goto inconsistent; + } else { + col->def_val.data = btr_copy_externally_stored_field( + &col->def_val.len, data, + cur.page_cur.block->zip_size(), + len, index->table->heap); + } + } + + mem_heap_free(heap); + return DB_SUCCESS; +} + +/** Load the instant ALTER TABLE metadata from the clustered index +when loading a table definition. +@param[in,out] table table definition from the data dictionary +@return error code +@retval DB_SUCCESS if no error occurred */ +dberr_t +btr_cur_instant_init(dict_table_t* table) +{ + mtr_t mtr; + dict_index_t* index = dict_table_get_first_index(table); + mtr.start(); + dberr_t err = index + ? btr_cur_instant_init_low(index, &mtr) + : DB_CORRUPTION; + mtr.commit(); + return(err); +} + +/** Initialize the n_core_null_bytes on first access to a clustered +index root page. +@param[in] index clustered index that is on its first access +@param[in] page clustered index root page +@return whether the page is corrupted */ +bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page) +{ + ut_ad(!index->is_dummy); + ut_ad(fil_page_index_page_check(page)); + ut_ad(!page_has_siblings(page)); + ut_ad(page_get_space_id(page) == index->table->space_id); + ut_ad(page_get_page_no(page) == index->page); + ut_ad(!page_is_comp(page) == !dict_table_is_comp(index->table)); + ut_ad(index->is_primary()); + ut_ad(!index->is_instant()); + ut_ad(index->table->supports_instant()); + /* This is normally executed as part of btr_cur_instant_init() + when dict_load_table_one() is loading a table definition. + Other threads should not access or modify the n_core_null_bytes, + n_core_fields before dict_load_table_one() returns. + + This can also be executed during IMPORT TABLESPACE, where the + table definition is exclusively locked. */ + + switch (fil_page_get_type(page)) { + default: + ut_ad("wrong page type" == 0); + return true; + case FIL_PAGE_INDEX: + /* The field PAGE_INSTANT is guaranteed 0 on clustered + index root pages of ROW_FORMAT=COMPACT or + ROW_FORMAT=DYNAMIC when instant ADD COLUMN is not used. */ + ut_ad(!page_is_comp(page) || !page_get_instant(page)); + index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(index->n_nullable))); + return false; + case FIL_PAGE_TYPE_INSTANT: + break; + } + + const uint16_t n = page_get_instant(page); + + if (n < index->n_uniq + DATA_ROLL_PTR) { + /* The PRIMARY KEY (or hidden DB_ROW_ID) and + DB_TRX_ID,DB_ROLL_PTR columns must always be present + as 'core' fields. 
*/ + return true; + } + + if (n > REC_MAX_N_FIELDS) { + return true; + } + + index->n_core_fields = n & dict_index_t::MAX_N_FIELDS; + + const rec_t* infimum = page_get_infimum_rec(page); + const rec_t* supremum = page_get_supremum_rec(page); + + if (!memcmp(infimum, "infimum", 8) + && !memcmp(supremum, "supremum", 8)) { + if (n > index->n_fields) { + /* All fields, including those for instantly + added columns, must be present in the + data dictionary. */ + return true; + } + + ut_ad(!index->is_dummy); + ut_d(index->is_dummy = true); + index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(index->get_n_nullable(n))); + ut_d(index->is_dummy = false); + return false; + } + + if (memcmp(infimum, field_ref_zero, 8) + || memcmp(supremum, field_ref_zero, 7)) { + /* The infimum and supremum records must either contain + the original strings, or they must be filled with zero + bytes, except for the bytes that we have repurposed. */ + return true; + } + + index->n_core_null_bytes = supremum[7]; + return index->n_core_null_bytes > 128; +} + +/** Optimistically latches the leaf page or pages requested. +@param[in] block guessed buffer block +@param[in] modify_clock modify clock value +@param[in,out] latch_mode BTR_SEARCH_LEAF, ... +@param[in,out] cursor cursor +@param[in] file file name +@param[in] line line where called +@param[in] mtr mini-transaction +@return true if success */ +bool +btr_cur_optimistic_latch_leaves( + buf_block_t* block, + ib_uint64_t modify_clock, + ulint* latch_mode, + btr_cur_t* cursor, + const char* file, + unsigned line, + mtr_t* mtr) +{ + ut_ad(block->page.buf_fix_count()); + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); + + switch (*latch_mode) { + default: + ut_error; + return(false); + case BTR_SEARCH_LEAF: + case BTR_MODIFY_LEAF: + return(buf_page_optimistic_get(*latch_mode, block, + modify_clock, file, line, mtr)); + case BTR_SEARCH_PREV: + case BTR_MODIFY_PREV: + rw_lock_s_lock(&block->lock); + if (block->modify_clock != modify_clock) { + rw_lock_s_unlock(&block->lock); + return false; + } + const uint32_t curr_page_no = block->page.id().page_no(); + const uint32_t left_page_no = btr_page_get_prev(block->frame); + rw_lock_s_unlock(&block->lock); + + const rw_lock_type_t mode = *latch_mode == BTR_SEARCH_PREV + ? RW_S_LATCH : RW_X_LATCH; + + if (left_page_no != FIL_NULL) { + dberr_t err = DB_SUCCESS; + cursor->left_block = buf_page_get_gen( + page_id_t(cursor->index->table->space_id, + left_page_no), + cursor->index->table->space->zip_size(), + mode, nullptr, BUF_GET_POSSIBLY_FREED, + __FILE__, __LINE__, mtr, &err); + + if (!cursor->left_block) { + cursor->index->table->file_unreadable = true; + } + + if (cursor->left_block->page.status + == buf_page_t::FREED + || btr_page_get_next(cursor->left_block->frame) + != curr_page_no) { + /* release the left block */ + btr_leaf_page_release( + cursor->left_block, mode, mtr); + return false; + } + } else { + cursor->left_block = NULL; + } + + if (buf_page_optimistic_get(mode, block, modify_clock, + file, line, mtr)) { + if (btr_page_get_prev(block->frame) == left_page_no) { + /* block was already buffer-fixed while + entering the function and + buf_page_optimistic_get() buffer-fixes + it again. 
 */
+ ut_ad(2 <= block->page.buf_fix_count());
+ *latch_mode = mode;
+ return(true);
+ } else {
+ /* release the block and decrement buf_fix_count,
+ which was incremented
+ in buf_page_optimistic_get() */
+ btr_leaf_page_release(block, mode, mtr);
+ }
+ }
+
+ ut_ad(block->page.buf_fix_count());
+ /* release the left block */
+ if (cursor->left_block != NULL) {
+ btr_leaf_page_release(cursor->left_block,
+ mode, mtr);
+ }
+ }
+
+ return false;
+}
+
+/**
+Gets the intention as btr_intention_t from the latch_mode, and clears the
+intention flags from the latch_mode.
+@param latch_mode in/out: pointer to latch_mode
+@return intention for latching tree */
+static
+btr_intention_t
+btr_cur_get_and_clear_intention(
+ ulint *latch_mode)
+{
+ btr_intention_t intention;
+
+ switch (*latch_mode & (BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) {
+ case BTR_LATCH_FOR_INSERT:
+ intention = BTR_INTENTION_INSERT;
+ break;
+ case BTR_LATCH_FOR_DELETE:
+ intention = BTR_INTENTION_DELETE;
+ break;
+ default:
+ /* both or unknown */
+ intention = BTR_INTENTION_BOTH;
+ }
+ *latch_mode &= ulint(~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE));
+
+ return(intention);
+}
+
+/**
+Gets the desired latch type for the root leaf (the root page is a root leaf)
+for the latch mode.
+@param latch_mode in: BTR_SEARCH_LEAF, ...
+@return latch type */
+static
+rw_lock_type_t
+btr_cur_latch_for_root_leaf(
+ ulint latch_mode)
+{
+ switch (latch_mode) {
+ case BTR_SEARCH_LEAF:
+ case BTR_SEARCH_TREE:
+ case BTR_SEARCH_PREV:
+ return(RW_S_LATCH);
+ case BTR_MODIFY_LEAF:
+ case BTR_MODIFY_TREE:
+ case BTR_MODIFY_PREV:
+ return(RW_X_LATCH);
+ case BTR_CONT_MODIFY_TREE:
+ case BTR_CONT_SEARCH_TREE:
+ /* A root page should be latched already,
+ and does not need to be latched here.
+ fall through (RW_NO_LATCH) */
+ case BTR_NO_LATCHES:
+ return(RW_NO_LATCH);
+ }
+
+ ut_error;
+ return(RW_NO_LATCH); /* avoid compiler warnings */
+}
+
+/** Detects whether modifying the record might require modifying the tree
+structure.
+@param[in] index index
+@param[in] page page
+@param[in] lock_intention lock intention for the tree operation
+@param[in] rec record (current node_ptr)
+@param[in] rec_size size of the record or max size of node_ptr
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] mtr mtr
+@return true if tree modification is needed */
+static
+bool
+btr_cur_will_modify_tree(
+ dict_index_t* index,
+ const page_t* page,
+ btr_intention_t lock_intention,
+ const rec_t* rec,
+ ulint rec_size,
+ ulint zip_size,
+ mtr_t* mtr)
+{
+ ut_ad(!page_is_leaf(page));
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+
+ /* A pessimistic delete of the first record causes a delete & insert
+ of the node_ptr at the upper level. A subsequent page shrink is
+ also possible, which causes another delete of a node_ptr at the
+ upper level. So we should pay attention not only to the first and
+ last records but also to the 2nd record, because if the
+ "delete & insert" is done on a different page, the 2nd record
+ becomes the first record, and a following compress might delete it
+ and cause the upper level node_ptr modification.
*/
+
+ const ulint n_recs = page_get_n_recs(page);
+
+ if (lock_intention <= BTR_INTENTION_BOTH) {
+ compile_time_assert(BTR_INTENTION_DELETE < BTR_INTENTION_BOTH);
+ compile_time_assert(BTR_INTENTION_BOTH < BTR_INTENTION_INSERT);
+
+ if (!page_has_siblings(page)) {
+ return true;
+ }
+
+ ulint margin = rec_size;
+
+ if (lock_intention == BTR_INTENTION_BOTH) {
+ ulint level = btr_page_get_level(page);
+
+ /* This value is the worst-case estimate of how many
+ node_ptr records could be deleted from this page. It
+ is used to predict whether the cursor position could
+ become the leftmost record of this page. */
+ ulint max_nodes_deleted = 0;
+
+ /* Tree-modifying operations coming from below this
+ level can logically delete at most (2 ^ (level - 1))
+ records from this page, even in the most unlikely
+ case. */
+ if (level > 7) {
+ /* TODO: adjust this practical limit. */
+ max_nodes_deleted = 64;
+ } else if (level > 0) {
+ max_nodes_deleted = (ulint)1 << (level - 1);
+ }
+ /* Check what a delete would cause (BTR_INTENTION_BOTH
+ or BTR_INTENTION_DELETE). */
+ if (n_recs <= max_nodes_deleted * 2
+ || page_rec_is_first(rec, page)) {
+ /* The cursor record could become the leftmost
+ record of this page. */
+ return true;
+ }
+
+ if (page_has_prev(page)
+ && page_rec_distance_is_at_most(
+ page_get_infimum_rec(page), rec,
+ max_nodes_deleted)) {
+ return true;
+ }
+
+ if (page_has_next(page)
+ && page_rec_distance_is_at_most(
+ rec, page_get_supremum_rec(page),
+ max_nodes_deleted)) {
+ return true;
+ }
+
+ /* A delete of the leftmost record of a page causes a
+ delete & insert at its parent page. After that, the
+ delete might trigger btr_compress() and delete a
+ record at the parent page as well. Thus we should
+ consider the maximum number of deletes. */
+ margin *= max_nodes_deleted;
+ }
+
+ /* Safe because we already hold an SX latch on the index tree */
+ if (page_get_data_size(page)
+ < margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index)) {
+ return(true);
+ }
+ }
+
+ if (lock_intention >= BTR_INTENTION_BOTH) {
+ /* Check what an insert would cause (BTR_INTENTION_BOTH
+ or BTR_INTENTION_INSERT). */
+
+ /* Once btr_cur_limit_optimistic_insert_debug is in effect,
+ we should check it here in advance, since the maximum number
+ of records allowed in a page is limited. */
+ LIMIT_OPTIMISTIC_INSERT_DEBUG(n_recs, return true);
+
+ /* We need space for 2 records, for the case that a single
+ split and insert cannot fit.
+ page_get_max_insert_size_after_reorganize() includes space
+ for the page directory already */
+ ulint max_size
+ = page_get_max_insert_size_after_reorganize(page, 2);
+
+ if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + rec_size
+ || max_size < rec_size * 2) {
+ return(true);
+ }
+
+ /* TODO: optimize this condition for ROW_FORMAT=COMPRESSED.
+ This is based on the worst case, and we could invoke
+ page_zip_available() on the block->page.zip. */
+ /* We also need space for 2 records at the worst
+ compression rate. */
+ if (zip_size
+ && page_zip_empty_size(index->n_fields, zip_size)
+ <= rec_size * 2 + page_get_data_size(page)
+ + page_dir_calc_reserved_space(n_recs + 2)) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Detects whether modifying the record might need a modification opposite
+to the intention.
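+For example, deleting the first record of a page requires updating the node
+pointer in the parent page, which amounts to a delete followed by an insert
+there.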
+@param[in] page page +@param[in] lock_intention lock intention for the tree operation +@param[in] rec record (current node_ptr) +@return true if tree modification is needed */ +static +bool +btr_cur_need_opposite_intention( + const page_t* page, + btr_intention_t lock_intention, + const rec_t* rec) +{ + switch (lock_intention) { + case BTR_INTENTION_DELETE: + return (page_has_prev(page) && page_rec_is_first(rec, page)) || + (page_has_next(page) && page_rec_is_last(rec, page)); + case BTR_INTENTION_INSERT: + return page_has_next(page) && page_rec_is_last(rec, page); + case BTR_INTENTION_BOTH: + return(false); + } + + ut_error; + return(false); +} + +/** +@param[in] index b-tree +@return maximum size of a node pointer record in bytes */ +static ulint btr_node_ptr_max_size(const dict_index_t* index) +{ + if (dict_index_is_ibuf(index)) { + /* cannot estimate accurately */ + /* This is universal index for change buffer. + The max size of the entry is about max key length * 2. + (index key + primary key to be inserted to the index) + (The max key length is UNIV_PAGE_SIZE / 16 * 3 at + ha_innobase::max_supported_key_length(), + considering MAX_KEY_LENGTH = 3072 at MySQL imposes + the 3500 historical InnoDB value for 16K page size case.) + For the universal index, node_ptr contains most of the entry. + And 512 is enough to contain ibuf columns and meta-data */ + return srv_page_size / 8 * 3 + 512; + } + + /* Each record has page_no, length of page_no and header. */ + ulint comp = dict_table_is_comp(index->table); + ulint rec_max_size = comp + ? REC_NODE_PTR_SIZE + 1 + REC_N_NEW_EXTRA_BYTES + + UT_BITS_IN_BYTES(index->n_nullable) + : REC_NODE_PTR_SIZE + 2 + REC_N_OLD_EXTRA_BYTES + + 2 * index->n_fields; + + /* Compute the maximum possible record size. */ + for (ulint i = 0; i < dict_index_get_n_unique_in_tree(index); i++) { + const dict_field_t* field + = dict_index_get_nth_field(index, i); + const dict_col_t* col + = dict_field_get_col(field); + ulint field_max_size; + ulint field_ext_max_size; + + /* Determine the maximum length of the index field. */ + + field_max_size = dict_col_get_fixed_size(col, comp); + if (field_max_size) { + /* dict_index_add_col() should guarantee this */ + ut_ad(!field->prefix_len + || field->fixed_len == field->prefix_len); + /* Fixed lengths are not encoded + in ROW_FORMAT=COMPACT. */ + rec_max_size += field_max_size; + continue; + } + + field_max_size = dict_col_get_max_size(col); + if (UNIV_UNLIKELY(!field_max_size)) { + switch (col->mtype) { + case DATA_VARCHAR: + if (!comp + && (!strcmp(index->table->name.m_name, + "SYS_FOREIGN") + || !strcmp(index->table->name.m_name, + "SYS_FOREIGN_COLS"))) { + break; + } + /* fall through */ + case DATA_VARMYSQL: + case DATA_CHAR: + case DATA_MYSQL: + /* CHAR(0) and VARCHAR(0) are possible + data type definitions in MariaDB. + The InnoDB internal SQL parser maps + CHAR to DATA_VARCHAR, so DATA_CHAR (or + DATA_MYSQL) is only coming from the + MariaDB SQL layer. */ + if (comp) { + /* Add a length byte, because + fixed-length empty field are + encoded as variable-length. + For ROW_FORMAT=REDUNDANT, + these bytes were added to + rec_max_size before this loop. */ + rec_max_size++; + } + continue; + } + + /* SYS_FOREIGN.ID is defined as CHAR in the + InnoDB internal SQL parser, which translates + into the incorrect VARCHAR(0). InnoDB does + not enforce maximum lengths of columns, so + that is why any data can be inserted in the + first place. 
+ + Likewise, SYS_FOREIGN.FOR_NAME, + SYS_FOREIGN.REF_NAME, SYS_FOREIGN_COLS.ID, are + defined as CHAR, and also they are part of a key. */ + + ut_ad(!strcmp(index->table->name.m_name, + "SYS_FOREIGN") + || !strcmp(index->table->name.m_name, + "SYS_FOREIGN_COLS")); + ut_ad(!comp); + ut_ad(col->mtype == DATA_VARCHAR); + + rec_max_size += (srv_page_size == UNIV_PAGE_SIZE_MAX) + ? REDUNDANT_REC_MAX_DATA_SIZE + : page_get_free_space_of_empty(FALSE) / 2; + } else if (field_max_size == NAME_LEN && i == 1 + && (!strcmp(index->table->name.m_name, + TABLE_STATS_NAME) + || !strcmp(index->table->name.m_name, + INDEX_STATS_NAME))) { + /* Interpret "table_name" as VARCHAR(199) even + if it was incorrectly defined as VARCHAR(64). + While the caller of ha_innobase enforces the + maximum length on any data written, the InnoDB + internal SQL parser will happily write as much + data as is provided. The purpose of this hack + is to avoid InnoDB hangs after persistent + statistics on partitioned tables are + deleted. */ + field_max_size = 199 * SYSTEM_CHARSET_MBMAXLEN; + } + field_ext_max_size = field_max_size < 256 ? 1 : 2; + + if (field->prefix_len + && field->prefix_len < field_max_size) { + field_max_size = field->prefix_len; + } + + if (comp) { + /* Add the extra size for ROW_FORMAT=COMPACT. + For ROW_FORMAT=REDUNDANT, these bytes were + added to rec_max_size before this loop. */ + rec_max_size += field_ext_max_size; + } + + rec_max_size += field_max_size; + } + + return rec_max_size; +} + +/********************************************************************//** +Searches an index tree and positions a tree cursor on a given level. +NOTE: n_fields_cmp in tuple must be set so that it cannot be compared +to node pointer page number fields on the upper levels of the tree! +Note that if mode is PAGE_CUR_LE, which is used in inserts, then +cursor->up_match and cursor->low_match both will have sensible values. +If mode is PAGE_CUR_GE, then up_match will a have a sensible value. + +If mode is PAGE_CUR_LE , cursor is left at the place where an insert of the +search tuple should be performed in the B-tree. InnoDB does an insert +immediately after the cursor. Thus, the cursor may end up on a user record, +or on a page infimum record. */ +dberr_t +btr_cur_search_to_nth_level_func( + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: the tree level of search */ + const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in + tuple must be set so that it cannot get + compared to the node ptr page number field! */ + page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...; + Inserts should always be made using + PAGE_CUR_LE to search the position! */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with + at most one of BTR_INSERT, BTR_DELETE_MARK, + BTR_DELETE, or BTR_ESTIMATE; + cursor->left_block is used to store a pointer + to the left neighbor page, in the cases + BTR_SEARCH_PREV and BTR_MODIFY_PREV; + NOTE that if ahi_latch, we might not have a + cursor page latch, we assume that ahi_latch + protects the record! */ + btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is + s- or x-latched, but see also above! 
*/ +#ifdef BTR_CUR_HASH_ADAPT + rw_lock_t* ahi_latch, + /*!< in: currently held btr_search_latch + (in RW_S_LATCH mode), or NULL */ +#endif /* BTR_CUR_HASH_ADAPT */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr, /*!< in: mtr */ + ib_uint64_t autoinc)/*!< in: PAGE_ROOT_AUTO_INC to be written + (0 if none) */ +{ + page_t* page = NULL; /* remove warning */ + buf_block_t* block; + buf_block_t* guess; + ulint height; + ulint up_match; + ulint up_bytes; + ulint low_match; + ulint low_bytes; + ulint rw_latch; + page_cur_mode_t page_mode; + page_cur_mode_t search_mode = PAGE_CUR_UNSUPP; + ulint buf_mode; + ulint estimate; + ulint node_ptr_max_size = srv_page_size / 2; + page_cur_t* page_cursor; + btr_op_t btr_op; + ulint root_height = 0; /* remove warning */ + dberr_t err = DB_SUCCESS; + + btr_intention_t lock_intention; + bool modify_external; + buf_block_t* tree_blocks[BTR_MAX_LEVELS]; + ulint tree_savepoints[BTR_MAX_LEVELS]; + ulint n_blocks = 0; + ulint n_releases = 0; + bool detected_same_key_root = false; + + bool retrying_for_search_prev = false; + ulint leftmost_from_level = 0; + buf_block_t** prev_tree_blocks = NULL; + ulint* prev_tree_savepoints = NULL; + ulint prev_n_blocks = 0; + ulint prev_n_releases = 0; + bool need_path = true; + bool rtree_parent_modified = false; + bool mbr_adj = false; + bool found = false; + + DBUG_ENTER("btr_cur_search_to_nth_level"); + +#ifdef BTR_CUR_ADAPT + btr_search_t* info; +#endif /* BTR_CUR_ADAPT */ + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs offsets2_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets2 = offsets2_; + rec_offs_init(offsets_); + rec_offs_init(offsets2_); + /* Currently, PAGE_CUR_LE is the only search mode used for searches + ending to upper levels */ + + ut_ad(level == 0 || mode == PAGE_CUR_LE + || RTREE_SEARCH_MODE(mode)); + ut_ad(dict_index_check_search_tuple(index, tuple)); + ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr)); + ut_ad(dtuple_check_typed(tuple)); + ut_ad(!(index->type & DICT_FTS)); + ut_ad(index->page != FIL_NULL); + + MEM_UNDEFINED(&cursor->up_match, sizeof cursor->up_match); + MEM_UNDEFINED(&cursor->up_bytes, sizeof cursor->up_bytes); + MEM_UNDEFINED(&cursor->low_match, sizeof cursor->low_match); + MEM_UNDEFINED(&cursor->low_bytes, sizeof cursor->low_bytes); +#ifdef UNIV_DEBUG + cursor->up_match = ULINT_UNDEFINED; + cursor->low_match = ULINT_UNDEFINED; +#endif /* UNIV_DEBUG */ + + ibool s_latch_by_caller; + + s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED; + + ut_ad(!s_latch_by_caller + || srv_read_only_mode + || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK + | MTR_MEMO_SX_LOCK)); + + /* These flags are mutually exclusive, they are lumped together + with the latch mode for historical reasons. It's possible for + none of the flags to be set. */ + switch (UNIV_EXPECT(latch_mode + & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK), + 0)) { + case 0: + btr_op = BTR_NO_OP; + break; + case BTR_INSERT: + btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE) + ? BTR_INSERT_IGNORE_UNIQUE_OP + : BTR_INSERT_OP; + break; + case BTR_DELETE: + btr_op = BTR_DELETE_OP; + ut_a(cursor->purge_node); + break; + case BTR_DELETE_MARK: + btr_op = BTR_DELMARK_OP; + break; + default: + /* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK + should be specified at a time */ + ut_error; + } + + /* Operations on the insert buffer tree cannot be buffered. 
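+ Buffering them would mean using the change buffer to buffer changes
+ to itself.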
*/ + ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index)); + /* Operations on the clustered index cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index)); + /* Operations on the temporary table(indexes) cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !index->table->is_temporary()); + /* Operation on the spatial index cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !dict_index_is_spatial(index)); + + estimate = latch_mode & BTR_ESTIMATE; + + lock_intention = btr_cur_get_and_clear_intention(&latch_mode); + + modify_external = latch_mode & BTR_MODIFY_EXTERNAL; + + /* Turn the flags unrelated to the latch mode off. */ + latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + + ut_ad(!modify_external || latch_mode == BTR_MODIFY_LEAF); + + ut_ad(!s_latch_by_caller + || latch_mode == BTR_SEARCH_LEAF + || latch_mode == BTR_SEARCH_TREE + || latch_mode == BTR_MODIFY_LEAF); + + ut_ad(autoinc == 0 || dict_index_is_clust(index)); + ut_ad(autoinc == 0 + || latch_mode == BTR_MODIFY_TREE + || latch_mode == BTR_MODIFY_LEAF); + ut_ad(autoinc == 0 || level == 0); + + cursor->flag = BTR_CUR_BINARY; + cursor->index = index; + +#ifndef BTR_CUR_ADAPT + guess = NULL; +#else + info = btr_search_get_info(index); + guess = info->root_guess; + +#ifdef BTR_CUR_HASH_ADAPT + +# ifdef UNIV_SEARCH_PERF_STAT + info->n_searches++; +# endif + if (autoinc == 0 + && latch_mode <= BTR_MODIFY_LEAF + && info->last_hash_succ +# ifdef MYSQL_INDEX_DISABLE_AHI + && !index->disable_ahi +# endif + && !estimate +# ifdef PAGE_CUR_LE_OR_EXTENDS + && mode != PAGE_CUR_LE_OR_EXTENDS +# endif /* PAGE_CUR_LE_OR_EXTENDS */ + && !dict_index_is_spatial(index) + /* If !ahi_latch, we do a dirty read of + btr_search_enabled below, and btr_search_guess_on_hash() + will have to check it again. */ + && btr_search_enabled + && !modify_external + && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG) + && btr_search_guess_on_hash(index, info, tuple, mode, + latch_mode, cursor, + ahi_latch, mtr)) { + + /* Search using the hash index succeeded */ + + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_GE); + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + ut_ad(cursor->low_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + btr_cur_n_sea++; + + DBUG_RETURN(err); + } +# endif /* BTR_CUR_HASH_ADAPT */ +#endif /* BTR_CUR_ADAPT */ + btr_cur_n_non_sea++; + + /* If the hash search did not succeed, do binary search down the + tree */ + +#ifdef BTR_CUR_HASH_ADAPT + if (ahi_latch) { + /* Release possible search latch to obey latching order */ + rw_lock_s_unlock(ahi_latch); + } +#endif /* BTR_CUR_HASH_ADAPT */ + + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched leaf node(s) */ + + ulint savepoint = mtr_set_savepoint(mtr); + + rw_lock_type_t upper_rw_latch; + + switch (latch_mode) { + case BTR_MODIFY_TREE: + /* Most of delete-intended operations are purging. + Free blocks and read IO bandwidth should be prior + for them, when the history list is glowing huge. */ + if (lock_intention == BTR_INTENTION_DELETE + && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH + && buf_pool.n_pend_reads) { +x_latch_index: + mtr_x_lock_index(index, mtr); + } else if (index->is_spatial() + && lock_intention <= BTR_INTENTION_BOTH) { + /* X lock the if there is possibility of + pessimistic delete on spatial index. 
As we could + lock upward for the tree */ + goto x_latch_index; + } else { + mtr_sx_lock_index(index, mtr); + } + upper_rw_latch = RW_X_LATCH; + break; + case BTR_CONT_MODIFY_TREE: + case BTR_CONT_SEARCH_TREE: + /* Do nothing */ + ut_ad(srv_read_only_mode + || mtr->memo_contains_flagged(&index->lock, + MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + if (dict_index_is_spatial(index) + && latch_mode == BTR_CONT_MODIFY_TREE) { + /* If we are about to locating parent page for split + and/or merge operation for R-Tree index, X latch + the parent */ + upper_rw_latch = RW_X_LATCH; + } else { + upper_rw_latch = RW_NO_LATCH; + } + break; + default: + if (!srv_read_only_mode) { + if (s_latch_by_caller) { + ut_ad(rw_lock_own(dict_index_get_lock(index), + RW_LOCK_S)); + } else if (!modify_external) { + /* BTR_SEARCH_TREE is intended to be used with + BTR_ALREADY_S_LATCHED */ + ut_ad(latch_mode != BTR_SEARCH_TREE); + + mtr_s_lock_index(index, mtr); + } else { + /* BTR_MODIFY_EXTERNAL needs to be excluded */ + mtr_sx_lock_index(index, mtr); + } + upper_rw_latch = RW_S_LATCH; + } else { + upper_rw_latch = RW_NO_LATCH; + } + } + const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf( + latch_mode); + + page_cursor = btr_cur_get_page_cur(cursor); + + const ulint zip_size = index->table->space->zip_size(); + + /* Start with the root page. */ + page_id_t page_id(index->table->space_id, index->page); + + if (root_leaf_rw_latch == RW_X_LATCH) { + node_ptr_max_size = btr_node_ptr_max_size(index); + } + + up_match = 0; + up_bytes = 0; + low_match = 0; + low_bytes = 0; + + height = ULINT_UNDEFINED; + + /* We use these modified search modes on non-leaf levels of the + B-tree. These let us end up in the right B-tree leaf. In that leaf + we use the original search mode. */ + + switch (mode) { + case PAGE_CUR_GE: + page_mode = PAGE_CUR_L; + break; + case PAGE_CUR_G: + page_mode = PAGE_CUR_LE; + break; + default: +#ifdef PAGE_CUR_LE_OR_EXTENDS + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || RTREE_SEARCH_MODE(mode) + || mode == PAGE_CUR_LE_OR_EXTENDS); +#else /* PAGE_CUR_LE_OR_EXTENDS */ + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || RTREE_SEARCH_MODE(mode)); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + page_mode = mode; + break; + } + + /* Loop and search until we arrive at the desired level */ + btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}}; + +search_loop: + buf_mode = BUF_GET; + rw_latch = RW_NO_LATCH; + rtree_parent_modified = false; + + if (height != 0) { + /* We are about to fetch the root or a non-leaf page. */ + if ((latch_mode != BTR_MODIFY_TREE || height == level) + && !retrying_for_search_prev) { + /* If doesn't have SX or X latch of index, + each pages should be latched before reading. */ + if (height == ULINT_UNDEFINED + && upper_rw_latch == RW_S_LATCH + && (modify_external || autoinc)) { + /* needs sx-latch of root page + for fseg operation or for writing + PAGE_ROOT_AUTO_INC */ + rw_latch = RW_SX_LATCH; + } else { + rw_latch = upper_rw_latch; + } + } + } else if (latch_mode <= BTR_MODIFY_LEAF) { + rw_latch = latch_mode; + + if (btr_op != BTR_NO_OP + && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) { + + /* Try to buffer the operation if the leaf + page is not in the buffer pool. */ + + buf_mode = btr_op == BTR_DELETE_OP + ? 
BUF_GET_IF_IN_POOL_OR_WATCH + : BUF_GET_IF_IN_POOL; + } + } + +retry_page_get: + ut_ad(n_blocks < BTR_MAX_LEVELS); + tree_savepoints[n_blocks] = mtr_set_savepoint(mtr); + block = buf_page_get_gen(page_id, zip_size, rw_latch, guess, + buf_mode, file, line, mtr, &err, + height == 0 && !index->is_clust()); + tree_blocks[n_blocks] = block; + + /* Note that block==NULL signifies either an error or change + buffering. */ + + if (err != DB_SUCCESS) { + ut_ad(block == NULL); + if (err == DB_DECRYPTION_FAILED) { + ib_push_warning((void *)NULL, + DB_DECRYPTION_FAILED, + "Table %s is encrypted but encryption service or" + " used key_id is not available. " + " Can't continue reading table.", + index->table->name.m_name); + index->table->file_unreadable = true; + } + + goto func_exit; + } + + if (block == NULL) { + /* This must be a search to perform an insert/delete + mark/ delete; try using the insert/delete buffer */ + + ut_ad(height == 0); + ut_ad(cursor->thr); + + switch (btr_op) { + case BTR_INSERT_OP: + case BTR_INSERT_IGNORE_UNIQUE_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL); + ut_ad(!dict_index_is_spatial(index)); + + if (ibuf_insert(IBUF_OP_INSERT, tuple, index, + page_id, zip_size, cursor->thr)) { + + cursor->flag = BTR_CUR_INSERT_TO_IBUF; + + goto func_exit; + } + break; + + case BTR_DELMARK_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL); + ut_ad(!dict_index_is_spatial(index)); + + if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple, + index, page_id, zip_size, + cursor->thr)) { + + cursor->flag = BTR_CUR_DEL_MARK_IBUF; + + goto func_exit; + } + + break; + + case BTR_DELETE_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH); + ut_ad(!dict_index_is_spatial(index)); + + if (!row_purge_poss_sec(cursor->purge_node, + index, tuple)) { + + /* The record cannot be purged yet. */ + cursor->flag = BTR_CUR_DELETE_REF; + } else if (ibuf_insert(IBUF_OP_DELETE, tuple, + index, page_id, zip_size, + cursor->thr)) { + + /* The purge was buffered. */ + cursor->flag = BTR_CUR_DELETE_IBUF; + } else { + /* The purge could not be buffered. */ + buf_pool.watch_unset(page_id); + break; + } + + buf_pool.watch_unset(page_id); + goto func_exit; + + default: + ut_error; + } + + /* Insert to the insert/delete buffer did not succeed, we + must read the page from disk. */ + + buf_mode = BUF_GET; + + goto retry_page_get; + } + + if (retrying_for_search_prev && height != 0) { + /* also latch left sibling */ + uint32_t left_page_no; + buf_block_t* get_block; + + ut_ad(rw_latch == RW_NO_LATCH); + + rw_latch = upper_rw_latch; + + rw_lock_s_lock(&block->lock); + left_page_no = btr_page_get_prev(buf_block_get_frame(block)); + rw_lock_s_unlock(&block->lock); + + if (left_page_no != FIL_NULL) { + ut_ad(prev_n_blocks < leftmost_from_level); + + prev_tree_savepoints[prev_n_blocks] + = mtr_set_savepoint(mtr); + get_block = buf_page_get_gen( + page_id_t(page_id.space(), left_page_no), + zip_size, rw_latch, NULL, buf_mode, + file, line, mtr, &err); + prev_tree_blocks[prev_n_blocks] = get_block; + prev_n_blocks++; + + if (err != DB_SUCCESS) { + if (err == DB_DECRYPTION_FAILED) { + ib_push_warning((void *)NULL, + DB_DECRYPTION_FAILED, + "Table %s is encrypted but encryption service or" + " used key_id is not available. " + " Can't continue reading table.", + index->table->name.m_name); + index->table->file_unreadable = true; + } + + goto func_exit; + } + + /* BTR_MODIFY_TREE doesn't update prev/next_page_no, + without their parent page's lock. So, not needed to + retry here, because we have the parent page's lock. 
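+ (In other words, a concurrent tree-structure change cannot
+ relink this page's siblings while we hold the parent page
+ latch, so the prev page number read above is still valid and
+ does not need to be re-checked after latching the left
+ sibling.)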
*/ + } + + /* release RW_NO_LATCH page and lock with RW_S_LATCH */ + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_blocks], + tree_blocks[n_blocks]); + + tree_savepoints[n_blocks] = mtr_set_savepoint(mtr); + block = buf_page_get_gen(page_id, zip_size, + rw_latch, NULL, buf_mode, + file, line, mtr, &err); + tree_blocks[n_blocks] = block; + + if (err != DB_SUCCESS) { + if (err == DB_DECRYPTION_FAILED) { + ib_push_warning((void *)NULL, + DB_DECRYPTION_FAILED, + "Table %s is encrypted but encryption service or" + " used key_id is not available. " + " Can't continue reading table.", + index->table->name.m_name); + index->table->file_unreadable = true; + } + + goto func_exit; + } + } + + page = buf_block_get_frame(block); + + if (height == ULINT_UNDEFINED + && page_is_leaf(page) + && rw_latch != RW_NO_LATCH + && rw_latch != root_leaf_rw_latch) { + /* The root page is also a leaf page (root_leaf). + We should reacquire the page, because the root page + is latched differently from leaf pages. */ + ut_ad(root_leaf_rw_latch != RW_NO_LATCH); + ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_SX_LATCH); + ut_ad(rw_latch == RW_S_LATCH || modify_external || autoinc); + ut_ad(!autoinc || root_leaf_rw_latch == RW_X_LATCH); + + ut_ad(n_blocks == 0); + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_blocks], + tree_blocks[n_blocks]); + + upper_rw_latch = root_leaf_rw_latch; + goto search_loop; + } + + if (rw_latch != RW_NO_LATCH) { +#ifdef UNIV_ZIP_DEBUG + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + buf_block_dbg_add_level( + block, dict_index_is_ibuf(index) + ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE); + } + + ut_ad(fil_page_index_page_check(page)); + ut_ad(index->id == btr_page_get_index_id(page)); + + if (height == ULINT_UNDEFINED) { + /* We are in the root node */ + + height = btr_page_get_level(page); + root_height = height; + cursor->tree_height = root_height + 1; + + if (dict_index_is_spatial(index)) { + ut_ad(cursor->rtr_info); + + /* If SSN in memory is not initialized, fetch + it from root page */ + if (!rtr_get_current_ssn_id(index)) { + /* FIXME: do this in dict_load_table_one() */ + index->set_ssn(page_get_ssn_id(page) + 1); + } + + /* Save the MBR */ + cursor->rtr_info->thr = cursor->thr; + rtr_get_mbr_from_tuple(tuple, &cursor->rtr_info->mbr); + } + +#ifdef BTR_CUR_ADAPT + info->root_guess = block; +#endif + } + + if (height == 0) { + if (rw_latch == RW_NO_LATCH) { + latch_leaves = btr_cur_latch_leaves( + block, latch_mode, cursor, mtr); + } + + switch (latch_mode) { + case BTR_MODIFY_TREE: + case BTR_CONT_MODIFY_TREE: + case BTR_CONT_SEARCH_TREE: + break; + default: + if (!s_latch_by_caller + && !srv_read_only_mode + && !modify_external) { + /* Release the tree s-latch */ + /* NOTE: BTR_MODIFY_EXTERNAL + needs to keep tree sx-latch */ + mtr_release_s_latch_at_savepoint( + mtr, savepoint, + dict_index_get_lock(index)); + } + + /* release upper blocks */ + if (retrying_for_search_prev) { + ut_ad(!autoinc); + for (; + prev_n_releases < prev_n_blocks; + prev_n_releases++) { + mtr_release_block_at_savepoint( + mtr, + prev_tree_savepoints[ + prev_n_releases], + prev_tree_blocks[ + prev_n_releases]); + } + } + + for (; n_releases < n_blocks; n_releases++) { + if (n_releases == 0 + && (modify_external || autoinc)) { + /* keep the root page latch */ + ut_ad(mtr->memo_contains_flagged( + tree_blocks[n_releases], + MTR_MEMO_PAGE_SX_FIX + | MTR_MEMO_PAGE_X_FIX)); + 
continue; + } + + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } + + page_mode = mode; + } + + if (dict_index_is_spatial(index)) { + /* Remember the page search mode */ + search_mode = page_mode; + + /* Some adjustment on search mode, when the + page search mode is PAGE_CUR_RTREE_LOCATE + or PAGE_CUR_RTREE_INSERT, as we are searching + with MBRs. When it is not the target level, we + should search all sub-trees that "CONTAIN" the + search range/MBR. When it is at the target + level, the search becomes PAGE_CUR_LE */ + if (page_mode == PAGE_CUR_RTREE_LOCATE + && level == height) { + if (level == 0) { + page_mode = PAGE_CUR_LE; + } else { + page_mode = PAGE_CUR_RTREE_GET_FATHER; + } + } + + if (page_mode == PAGE_CUR_RTREE_INSERT) { + page_mode = (level == height) + ? PAGE_CUR_LE + : PAGE_CUR_RTREE_INSERT; + + ut_ad(!page_is_leaf(page) || page_mode == PAGE_CUR_LE); + } + + /* "need_path" indicates if we need to tracking the parent + pages, if it is not spatial comparison, then no need to + track it */ + if (page_mode < PAGE_CUR_CONTAIN) { + need_path = false; + } + + up_match = 0; + low_match = 0; + + if (latch_mode == BTR_MODIFY_TREE + || latch_mode == BTR_CONT_MODIFY_TREE + || latch_mode == BTR_CONT_SEARCH_TREE) { + /* Tree are locked, no need for Page Lock to protect + the "path" */ + cursor->rtr_info->need_page_lock = false; + } + } + + if (dict_index_is_spatial(index) && page_mode >= PAGE_CUR_CONTAIN) { + ut_ad(need_path); + found = rtr_cur_search_with_match( + block, index, tuple, page_mode, page_cursor, + cursor->rtr_info); + + /* Need to use BTR_MODIFY_TREE to do the MBR adjustment */ + if (search_mode == PAGE_CUR_RTREE_INSERT + && cursor->rtr_info->mbr_adj) { + if (latch_mode & BTR_MODIFY_LEAF) { + /* Parent MBR needs updated, should retry + with BTR_MODIFY_TREE */ + goto func_exit; + } else if (latch_mode & BTR_MODIFY_TREE) { + rtree_parent_modified = true; + cursor->rtr_info->mbr_adj = false; + mbr_adj = true; + } else { + ut_ad(0); + } + } + + if (found && page_mode == PAGE_CUR_RTREE_GET_FATHER) { + cursor->low_match = + DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1; + } +#ifdef BTR_CUR_HASH_ADAPT + } else if (height == 0 && btr_search_enabled + && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG) + && !dict_index_is_spatial(index)) { + /* The adaptive hash index is only used when searching + for leaf pages (height==0), but not in r-trees. + We only need the byte prefix comparison for the purpose + of updating the adaptive hash index. */ + page_cur_search_with_match_bytes( + block, index, tuple, page_mode, &up_match, &up_bytes, + &low_match, &low_bytes, page_cursor); +#endif /* BTR_CUR_HASH_ADAPT */ + } else { + /* Search for complete index fields. */ + up_bytes = low_bytes = 0; + page_cur_search_with_match( + block, index, tuple, page_mode, &up_match, + &low_match, page_cursor, + need_path ? 
cursor->rtr_info : NULL); + } + + if (estimate) { + btr_cur_add_path_info(cursor, height, root_height); + } + + /* If this is the desired level, leave the loop */ + + ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor))); + + /* Add Predicate lock if it is serializable isolation + and only if it is in the search case */ + if (dict_index_is_spatial(index) + && cursor->rtr_info->need_prdt_lock + && mode != PAGE_CUR_RTREE_INSERT + && mode != PAGE_CUR_RTREE_LOCATE + && mode >= PAGE_CUR_CONTAIN) { + trx_t* trx = thr_get_trx(cursor->thr); + lock_prdt_t prdt; + + lock_mutex_enter(); + lock_init_prdt_from_mbr( + &prdt, &cursor->rtr_info->mbr, mode, + trx->lock.lock_heap); + lock_mutex_exit(); + + if (rw_latch == RW_NO_LATCH && height != 0) { + rw_lock_s_lock(&(block->lock)); + } + + lock_prdt_lock(block, &prdt, index, LOCK_S, + LOCK_PREDICATE, cursor->thr); + + if (rw_latch == RW_NO_LATCH && height != 0) { + rw_lock_s_unlock(&(block->lock)); + } + } + + if (level != height) { + + const rec_t* node_ptr; + ut_ad(height > 0); + + height--; + guess = NULL; + + node_ptr = page_cur_get_rec(page_cursor); + + offsets = rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + /* If the rec is the first or last in the page for + pessimistic delete intention, it might cause node_ptr insert + for the upper level. We should change the intention and retry. + */ + if (latch_mode == BTR_MODIFY_TREE + && btr_cur_need_opposite_intention( + page, lock_intention, node_ptr)) { + +need_opposite_intention: + ut_ad(upper_rw_latch == RW_X_LATCH); + + if (n_releases > 0) { + /* release root block */ + mtr_release_block_at_savepoint( + mtr, tree_savepoints[0], + tree_blocks[0]); + } + + /* release all blocks */ + for (; n_releases <= n_blocks; n_releases++) { + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + + lock_intention = BTR_INTENTION_BOTH; + + page_id.set_page_no(index->page); + up_match = 0; + low_match = 0; + height = ULINT_UNDEFINED; + + n_blocks = 0; + n_releases = 0; + + goto search_loop; + } + + if (dict_index_is_spatial(index)) { + if (page_rec_is_supremum(node_ptr)) { + cursor->low_match = 0; + cursor->up_match = 0; + goto func_exit; + } + + /* If we are doing insertion or record locating, + remember the tree nodes we visited */ + if (page_mode == PAGE_CUR_RTREE_INSERT + || (search_mode == PAGE_CUR_RTREE_LOCATE + && (latch_mode != BTR_MODIFY_LEAF))) { + bool add_latch = false; + + if (latch_mode == BTR_MODIFY_TREE + && rw_latch == RW_NO_LATCH) { + ut_ad(mtr->memo_contains_flagged( + &index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + rw_lock_s_lock(&block->lock); + add_latch = true; + } + + /* Store the parent cursor location */ +#ifdef UNIV_DEBUG + ulint num_stored = rtr_store_parent_path( + block, cursor, latch_mode, + height + 1, mtr); +#else + rtr_store_parent_path( + block, cursor, latch_mode, + height + 1, mtr); +#endif + + if (page_mode == PAGE_CUR_RTREE_INSERT) { + btr_pcur_t* r_cursor = + rtr_get_parent_cursor( + cursor, height + 1, + true); + /* If it is insertion, there should + be only one parent for each level + traverse */ +#ifdef UNIV_DEBUG + ut_ad(num_stored == 1); +#endif + + node_ptr = btr_pcur_get_rec(r_cursor); + + } + + if (add_latch) { + rw_lock_s_unlock(&block->lock); + } + + ut_ad(!page_rec_is_supremum(node_ptr)); + } + + ut_ad(page_mode == search_mode + || (page_mode == PAGE_CUR_WITHIN + && search_mode == PAGE_CUR_RTREE_LOCATE)); + + page_mode = search_mode; + } + + /* If the first or the last 
record of the page + or the same key value to the first record or last record, + the another page might be chosen when BTR_CONT_MODIFY_TREE. + So, the parent page should not released to avoiding deadlock + with blocking the another search with the same key value. */ + if (!detected_same_key_root + && lock_intention == BTR_INTENTION_BOTH + && !dict_index_is_unique(index) + && latch_mode == BTR_MODIFY_TREE + && (up_match >= rec_offs_n_fields(offsets) - 1 + || low_match >= rec_offs_n_fields(offsets) - 1)) { + const rec_t* first_rec = page_rec_get_next_const( + page_get_infimum_rec(page)); + ulint matched_fields; + + ut_ad(upper_rw_latch == RW_X_LATCH); + + if (node_ptr == first_rec + || page_rec_is_last(node_ptr, page)) { + detected_same_key_root = true; + } else { + matched_fields = 0; + + offsets2 = rec_get_offsets( + first_rec, index, offsets2, + 0, ULINT_UNDEFINED, &heap); + cmp_rec_rec(node_ptr, first_rec, + offsets, offsets2, index, false, + &matched_fields); + + if (matched_fields + >= rec_offs_n_fields(offsets) - 1) { + detected_same_key_root = true; + } else { + const rec_t* last_rec; + + last_rec = page_rec_get_prev_const( + page_get_supremum_rec(page)); + + matched_fields = 0; + + offsets2 = rec_get_offsets( + last_rec, index, offsets2, + 0, ULINT_UNDEFINED, &heap); + cmp_rec_rec( + node_ptr, last_rec, + offsets, offsets2, index, + false, &matched_fields); + if (matched_fields + >= rec_offs_n_fields(offsets) - 1) { + detected_same_key_root = true; + } + } + } + } + + /* If the page might cause modify_tree, + we should not release the parent page's lock. */ + if (!detected_same_key_root + && latch_mode == BTR_MODIFY_TREE + && !btr_cur_will_modify_tree( + index, page, lock_intention, node_ptr, + node_ptr_max_size, zip_size, mtr) + && !rtree_parent_modified) { + ut_ad(upper_rw_latch == RW_X_LATCH); + ut_ad(n_releases <= n_blocks); + + /* we can release upper blocks */ + for (; n_releases < n_blocks; n_releases++) { + if (n_releases == 0) { + /* we should not release root page + to pin to same block. */ + continue; + } + + /* release unused blocks to unpin */ + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } + + if (height == level + && latch_mode == BTR_MODIFY_TREE) { + ut_ad(upper_rw_latch == RW_X_LATCH); + /* we should sx-latch root page, if released already. + It contains seg_header. */ + if (n_releases > 0) { + mtr_block_sx_latch_at_savepoint( + mtr, tree_savepoints[0], + tree_blocks[0]); + } + + /* x-latch the branch blocks not released yet. */ + for (ulint i = n_releases; i <= n_blocks; i++) { + mtr_block_x_latch_at_savepoint( + mtr, tree_savepoints[i], + tree_blocks[i]); + } + } + + /* We should consider prev_page of parent page, if the node_ptr + is the leftmost of the page. because BTR_SEARCH_PREV and + BTR_MODIFY_PREV latches prev_page of the leaf page. */ + if ((latch_mode == BTR_SEARCH_PREV + || latch_mode == BTR_MODIFY_PREV) + && !retrying_for_search_prev) { + /* block should be latched for consistent + btr_page_get_prev() */ + ut_ad(mtr->memo_contains_flagged( + block, MTR_MEMO_PAGE_S_FIX + | MTR_MEMO_PAGE_X_FIX)); + + if (page_has_prev(page) + && page_rec_is_first(node_ptr, page)) { + + if (leftmost_from_level == 0) { + leftmost_from_level = height + 1; + } + } else { + leftmost_from_level = 0; + } + + if (height == 0 && leftmost_from_level > 0) { + /* should retry to get also prev_page + from level==leftmost_from_level. 
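+ (The pages from that level down to the leaf are released and
+ re-read, this time latching the left sibling at each level as
+ well, so that BTR_SEARCH_PREV/BTR_MODIFY_PREV can safely
+ latch the previous leaf page.)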
*/ + retrying_for_search_prev = true; + + prev_tree_blocks = static_cast<buf_block_t**>( + ut_malloc_nokey(sizeof(buf_block_t*) + * leftmost_from_level)); + + prev_tree_savepoints = static_cast<ulint*>( + ut_malloc_nokey(sizeof(ulint) + * leftmost_from_level)); + + /* back to the level (leftmost_from_level+1) */ + ulint idx = n_blocks + - (leftmost_from_level - 1); + + page_id.set_page_no( + tree_blocks[idx]->page.id().page_no()); + + for (ulint i = n_blocks + - (leftmost_from_level - 1); + i <= n_blocks; i++) { + mtr_release_block_at_savepoint( + mtr, tree_savepoints[i], + tree_blocks[i]); + } + + n_blocks -= (leftmost_from_level - 1); + height = leftmost_from_level; + ut_ad(n_releases == 0); + + /* replay up_match, low_match */ + up_match = 0; + low_match = 0; + rtr_info_t* rtr_info = need_path + ? cursor->rtr_info : NULL; + + for (ulint i = 0; i < n_blocks; i++) { + page_cur_search_with_match( + tree_blocks[i], index, tuple, + page_mode, &up_match, + &low_match, page_cursor, + rtr_info); + } + + goto search_loop; + } + } + + /* Go to the child node */ + page_id.set_page_no( + btr_node_ptr_get_child_page_no(node_ptr, offsets)); + + n_blocks++; + + if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) { + /* We're doing a search on an ibuf tree and we're one + level above the leaf page. */ + + ut_ad(level == 0); + + buf_mode = BUF_GET; + rw_latch = RW_NO_LATCH; + goto retry_page_get; + } + + if (dict_index_is_spatial(index) + && page_mode >= PAGE_CUR_CONTAIN + && page_mode != PAGE_CUR_RTREE_INSERT) { + ut_ad(need_path); + rtr_node_path_t* path = + cursor->rtr_info->path; + + if (!path->empty() && found) { + ut_ad(path->back().page_no + == page_id.page_no()); + path->pop_back(); +#ifdef UNIV_DEBUG + if (page_mode == PAGE_CUR_RTREE_LOCATE + && (latch_mode != BTR_MODIFY_LEAF)) { + btr_pcur_t* cur + = cursor->rtr_info->parent_path->back( + ).cursor; + rec_t* my_node_ptr + = btr_pcur_get_rec(cur); + + offsets = rec_get_offsets( + my_node_ptr, index, offsets, + 0, ULINT_UNDEFINED, &heap); + + ulint my_page_no + = btr_node_ptr_get_child_page_no( + my_node_ptr, offsets); + + ut_ad(page_id.page_no() == my_page_no); + } +#endif + } + } + + goto search_loop; + } else if (!dict_index_is_spatial(index) + && latch_mode == BTR_MODIFY_TREE + && lock_intention == BTR_INTENTION_INSERT + && page_has_next(page) + && page_rec_is_last(page_cur_get_rec(page_cursor), page)) { + + /* btr_insert_into_right_sibling() might cause + deleting node_ptr at upper level */ + + guess = NULL; + + if (height == 0) { + /* release the leaf pages if latched */ + for (uint i = 0; i < 3; i++) { + if (latch_leaves.blocks[i] != NULL) { + mtr_release_block_at_savepoint( + mtr, latch_leaves.savepoints[i], + latch_leaves.blocks[i]); + latch_leaves.blocks[i] = NULL; + } + } + } + + goto need_opposite_intention; + } + + if (level != 0) { + ut_ad(!autoinc); + + if (upper_rw_latch == RW_NO_LATCH) { + ut_ad(latch_mode == BTR_CONT_MODIFY_TREE + || latch_mode == BTR_CONT_SEARCH_TREE); + buf_block_t* child_block = btr_block_get( + *index, page_id.page_no(), + latch_mode == BTR_CONT_MODIFY_TREE + ? RW_X_LATCH : RW_SX_LATCH, false, mtr); + btr_assert_not_corrupted(child_block, index); + } else { + ut_ad(mtr->memo_contains_flagged(block, + upper_rw_latch)); + btr_assert_not_corrupted(block, index); + + if (s_latch_by_caller) { + ut_ad(latch_mode == BTR_SEARCH_TREE); + /* to exclude modifying tree operations + should sx-latch the index. 
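+ (Holding the index SX-latch keeps any concurrent
+ tree-structure change out, so the intermediate page latches
+ taken on the way down are no longer needed and are released
+ below.)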
*/ + ut_ad(mtr->memo_contains(index->lock, + MTR_MEMO_SX_LOCK)); + /* because has sx-latch of index, + can release upper blocks. */ + for (; n_releases < n_blocks; n_releases++) { + mtr_release_block_at_savepoint( + mtr, + tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } + } + + if (page_mode <= PAGE_CUR_LE) { + cursor->low_match = low_match; + cursor->up_match = up_match; + } + } else { + cursor->low_match = low_match; + cursor->low_bytes = low_bytes; + cursor->up_match = up_match; + cursor->up_bytes = up_bytes; + + if (autoinc) { + page_set_autoinc(tree_blocks[0], autoinc, mtr, false); + } + +#ifdef BTR_CUR_HASH_ADAPT + /* We do a dirty read of btr_search_enabled here. We + will properly check btr_search_enabled again in + btr_search_build_page_hash_index() before building a + page hash index, while holding search latch. */ + if (!btr_search_enabled) { +# ifdef MYSQL_INDEX_DISABLE_AHI + } else if (index->disable_ahi) { +# endif + } else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG) { + ut_ad(index->is_instant()); + /* This may be a search tuple for + btr_pcur_restore_position(). */ + ut_ad(tuple->is_metadata() + || (tuple->is_metadata(tuple->info_bits + ^ REC_STATUS_INSTANT))); + } else if (rec_is_metadata(btr_cur_get_rec(cursor), *index)) { + /* Only user records belong in the adaptive + hash index. */ + } else { + btr_search_info_update(index, cursor); + } +#endif /* BTR_CUR_HASH_ADAPT */ + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_GE); + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + ut_ad(cursor->low_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + } + + /* For spatial index, remember what blocks are still latched */ + if (dict_index_is_spatial(index) + && (latch_mode == BTR_MODIFY_TREE + || latch_mode == BTR_MODIFY_LEAF)) { + for (ulint i = 0; i < n_releases; i++) { + cursor->rtr_info->tree_blocks[i] = NULL; + cursor->rtr_info->tree_savepoints[i] = 0; + } + + for (ulint i = n_releases; i <= n_blocks; i++) { + cursor->rtr_info->tree_blocks[i] = tree_blocks[i]; + cursor->rtr_info->tree_savepoints[i] = tree_savepoints[i]; + } + } + +func_exit: + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (retrying_for_search_prev) { + ut_free(prev_tree_blocks); + ut_free(prev_tree_savepoints); + } + + if (mbr_adj) { + /* remember that we will need to adjust parent MBR */ + cursor->rtr_info->mbr_adj = true; + } + +#ifdef BTR_CUR_HASH_ADAPT + if (ahi_latch) { + rw_lock_s_lock(ahi_latch); + } +#endif /* BTR_CUR_HASH_ADAPT */ + + DBUG_RETURN(err); +} + +/*****************************************************************//** +Opens a cursor at either end of an index. */ +dberr_t +btr_cur_open_at_index_side_func( +/*============================*/ + bool from_left, /*!< in: true if open to the low end, + false if to the high end */ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: latch mode */ + btr_cur_t* cursor, /*!< in/out: cursor */ + ulint level, /*!< in: level to search for + (0=leaf). 
*/ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_cur_t* page_cursor; + ulint node_ptr_max_size = srv_page_size / 2; + ulint height; + ulint root_height = 0; /* remove warning */ + rec_t* node_ptr; + ulint estimate; + btr_intention_t lock_intention; + buf_block_t* tree_blocks[BTR_MAX_LEVELS]; + ulint tree_savepoints[BTR_MAX_LEVELS]; + ulint n_blocks = 0; + ulint n_releases = 0; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + dberr_t err = DB_SUCCESS; + + rec_offs_init(offsets_); + + estimate = latch_mode & BTR_ESTIMATE; + latch_mode &= ulint(~BTR_ESTIMATE); + + ut_ad(level != ULINT_UNDEFINED); + + bool s_latch_by_caller; + + s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED; + latch_mode &= ulint(~BTR_ALREADY_S_LATCHED); + + lock_intention = btr_cur_get_and_clear_intention(&latch_mode); + + ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL)); + + /* This function doesn't need to lock left page of the leaf page */ + if (latch_mode == BTR_SEARCH_PREV) { + latch_mode = BTR_SEARCH_LEAF; + } else if (latch_mode == BTR_MODIFY_PREV) { + latch_mode = BTR_MODIFY_LEAF; + } + + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched the leaf node */ + + ulint savepoint = mtr_set_savepoint(mtr); + + rw_lock_type_t upper_rw_latch; + + switch (latch_mode) { + case BTR_CONT_MODIFY_TREE: + case BTR_CONT_SEARCH_TREE: + upper_rw_latch = RW_NO_LATCH; + break; + case BTR_MODIFY_TREE: + /* Most of delete-intended operations are purging. + Free blocks and read IO bandwidth should be prior + for them, when the history list is glowing huge. */ + if (lock_intention == BTR_INTENTION_DELETE + && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH + && buf_pool.n_pend_reads) { + mtr_x_lock_index(index, mtr); + } else { + mtr_sx_lock_index(index, mtr); + } + upper_rw_latch = RW_X_LATCH; + break; + default: + ut_ad(!s_latch_by_caller + || mtr->memo_contains_flagged(&index->lock, + MTR_MEMO_SX_LOCK + | MTR_MEMO_S_LOCK)); + if (!srv_read_only_mode) { + if (!s_latch_by_caller) { + /* BTR_SEARCH_TREE is intended to be used with + BTR_ALREADY_S_LATCHED */ + ut_ad(latch_mode != BTR_SEARCH_TREE); + + mtr_s_lock_index(index, mtr); + } + upper_rw_latch = RW_S_LATCH; + } else { + upper_rw_latch = RW_NO_LATCH; + } + } + + const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf( + latch_mode); + + page_cursor = btr_cur_get_page_cur(cursor); + cursor->index = index; + + page_id_t page_id(index->table->space_id, index->page); + const ulint zip_size = index->table->space->zip_size(); + + if (root_leaf_rw_latch == RW_X_LATCH) { + node_ptr_max_size = btr_node_ptr_max_size(index); + } + + height = ULINT_UNDEFINED; + + for (;;) { + ut_ad(n_blocks < BTR_MAX_LEVELS); + tree_savepoints[n_blocks] = mtr_set_savepoint(mtr); + + const ulint rw_latch = height + && (latch_mode != BTR_MODIFY_TREE || height == level) + ? upper_rw_latch : RW_NO_LATCH; + buf_block_t* block = buf_page_get_gen(page_id, zip_size, + rw_latch, NULL, BUF_GET, + file, line, mtr, &err, + height == 0 + && !index->is_clust()); + ut_ad((block != NULL) == (err == DB_SUCCESS)); + tree_blocks[n_blocks] = block; + + if (err != DB_SUCCESS) { + if (err == DB_DECRYPTION_FAILED) { + ib_push_warning((void *)NULL, + DB_DECRYPTION_FAILED, + "Table %s is encrypted but encryption service or" + " used key_id is not available. 
" + " Can't continue reading table.", + index->table->name.m_name); + index->table->file_unreadable = true; + } + + goto exit_loop; + } + + const page_t* page = buf_block_get_frame(block); + + if (height == ULINT_UNDEFINED + && page_is_leaf(page) + && rw_latch != RW_NO_LATCH + && rw_latch != root_leaf_rw_latch) { + /* We should retry to get the page, because the root page + is latched with different level as a leaf page. */ + ut_ad(root_leaf_rw_latch != RW_NO_LATCH); + ut_ad(rw_latch == RW_S_LATCH); + + ut_ad(n_blocks == 0); + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_blocks], + tree_blocks[n_blocks]); + + upper_rw_latch = root_leaf_rw_latch; + continue; + } + + ut_ad(fil_page_index_page_check(page)); + ut_ad(index->id == btr_page_get_index_id(page)); + + if (height == ULINT_UNDEFINED) { + /* We are in the root node */ + + height = btr_page_get_level(page); + root_height = height; + ut_a(height >= level); + } else { + /* TODO: flag the index corrupted if this fails */ + ut_ad(height == btr_page_get_level(page)); + } + + if (height == 0) { + if (rw_latch == RW_NO_LATCH) { + btr_cur_latch_leaves(block, latch_mode, + cursor, mtr); + } + + /* In versions <= 3.23.52 we had forgotten to + release the tree latch here. If in an index + scan we had to scan far to find a record + visible to the current transaction, that could + starve others waiting for the tree latch. */ + + switch (latch_mode) { + case BTR_MODIFY_TREE: + case BTR_CONT_MODIFY_TREE: + case BTR_CONT_SEARCH_TREE: + break; + default: + if (UNIV_UNLIKELY(srv_read_only_mode)) { + break; + } + if (!s_latch_by_caller) { + /* Release the tree s-latch */ + mtr_release_s_latch_at_savepoint( + mtr, savepoint, &index->lock); + } + + /* release upper blocks */ + for (; n_releases < n_blocks; n_releases++) { + mtr_release_block_at_savepoint( + mtr, + tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } + } else if (height == level /* height != 0 */ + && UNIV_LIKELY(!srv_read_only_mode)) { + /* We already have the block latched. */ + ut_ad(latch_mode == BTR_SEARCH_TREE); + ut_ad(s_latch_by_caller); + ut_ad(upper_rw_latch == RW_S_LATCH); + ut_ad(mtr->memo_contains_flagged(block, + MTR_MEMO_PAGE_S_FIX)); + + if (s_latch_by_caller) { + /* to exclude modifying tree operations + should sx-latch the index. */ + ut_ad(mtr->memo_contains(index->lock, + MTR_MEMO_SX_LOCK)); + /* because has sx-latch of index, + can release upper blocks. */ + for (; n_releases < n_blocks; n_releases++) { + mtr_release_block_at_savepoint( + mtr, + tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } + } + + if (from_left) { + page_cur_set_before_first(block, page_cursor); + } else { + page_cur_set_after_last(block, page_cursor); + } + + if (height == level) { + if (estimate) { + btr_cur_add_path_info(cursor, height, + root_height); + } + + break; + } + + ut_ad(height > 0); + + if (from_left) { + page_cur_move_to_next(page_cursor); + } else { + page_cur_move_to_prev(page_cursor); + } + + if (estimate) { + btr_cur_add_path_info(cursor, height, root_height); + } + + height--; + + node_ptr = page_cur_get_rec(page_cursor); + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + 0, ULINT_UNDEFINED, &heap); + + /* If the rec is the first or last in the page for + pessimistic delete intention, it might cause node_ptr insert + for the upper level. We should change the intention and retry. 
+ */ + if (latch_mode == BTR_MODIFY_TREE + && btr_cur_need_opposite_intention( + page, lock_intention, node_ptr)) { + + ut_ad(upper_rw_latch == RW_X_LATCH); + /* release all blocks */ + for (; n_releases <= n_blocks; n_releases++) { + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + + lock_intention = BTR_INTENTION_BOTH; + + page_id.set_page_no(dict_index_get_page(index)); + + height = ULINT_UNDEFINED; + + n_blocks = 0; + n_releases = 0; + + continue; + } + + if (latch_mode == BTR_MODIFY_TREE + && !btr_cur_will_modify_tree( + cursor->index, page, lock_intention, node_ptr, + node_ptr_max_size, zip_size, mtr)) { + ut_ad(upper_rw_latch == RW_X_LATCH); + ut_ad(n_releases <= n_blocks); + + /* we can release upper blocks */ + for (; n_releases < n_blocks; n_releases++) { + if (n_releases == 0) { + /* we should not release root page + to pin to same block. */ + continue; + } + + /* release unused blocks to unpin */ + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } + + if (height == level + && latch_mode == BTR_MODIFY_TREE) { + ut_ad(upper_rw_latch == RW_X_LATCH); + /* we should sx-latch root page, if released already. + It contains seg_header. */ + if (n_releases > 0) { + mtr_block_sx_latch_at_savepoint( + mtr, tree_savepoints[0], + tree_blocks[0]); + } + + /* x-latch the branch blocks not released yet. */ + for (ulint i = n_releases; i <= n_blocks; i++) { + mtr_block_x_latch_at_savepoint( + mtr, tree_savepoints[i], + tree_blocks[i]); + } + } + + /* Go to the child node */ + page_id.set_page_no( + btr_node_ptr_get_child_page_no(node_ptr, offsets)); + + n_blocks++; + } + + exit_loop: + if (heap) { + mem_heap_free(heap); + } + + return err; +} + +/**********************************************************************//** +Positions a cursor at a randomly chosen position within a B-tree. +@return true if the index is available and we have put the cursor, false +if the index is unavailable */ +bool +btr_cur_open_at_rnd_pos_func( +/*=========================*/ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_cur_t* cursor, /*!< in/out: B-tree cursor */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t* page_cursor; + ulint node_ptr_max_size = srv_page_size / 2; + ulint height; + rec_t* node_ptr; + btr_intention_t lock_intention; + buf_block_t* tree_blocks[BTR_MAX_LEVELS]; + ulint tree_savepoints[BTR_MAX_LEVELS]; + ulint n_blocks = 0; + ulint n_releases = 0; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(!index->is_spatial()); + + lock_intention = btr_cur_get_and_clear_intention(&latch_mode); + + ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL)); + + ulint savepoint = mtr_set_savepoint(mtr); + + rw_lock_type_t upper_rw_latch; + + switch (latch_mode) { + case BTR_MODIFY_TREE: + /* Most of delete-intended operations are purging. + Free blocks and read IO bandwidth should be prior + for them, when the history list is glowing huge. 
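+ (That is, when the purge lag is large and page reads are
+ pending, a delete-intended operation takes the index X-latch
+ instead of the SX-latch so that it is not held back by
+ concurrent tree operations.)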
*/ + if (lock_intention == BTR_INTENTION_DELETE + && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH + && buf_pool.n_pend_reads) { + mtr_x_lock_index(index, mtr); + } else { + mtr_sx_lock_index(index, mtr); + } + upper_rw_latch = RW_X_LATCH; + break; + case BTR_SEARCH_PREV: + case BTR_MODIFY_PREV: + /* This function doesn't support left uncle + page lock for left leaf page lock, when + needed. */ + case BTR_SEARCH_TREE: + case BTR_CONT_MODIFY_TREE: + case BTR_CONT_SEARCH_TREE: + ut_ad(0); + /* fall through */ + default: + if (!srv_read_only_mode) { + mtr_s_lock_index(index, mtr); + upper_rw_latch = RW_S_LATCH; + } else { + upper_rw_latch = RW_NO_LATCH; + } + } + + DBUG_EXECUTE_IF("test_index_is_unavailable", + return(false);); + + if (index->page == FIL_NULL) { + /* Since we don't hold index lock until just now, the index + could be modified by others, for example, if this is a + statistics updater for referenced table, it could be marked + as unavailable by 'DROP TABLE' in the mean time, since + we don't hold lock for statistics updater */ + return(false); + } + + const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf( + latch_mode); + + page_cursor = btr_cur_get_page_cur(cursor); + cursor->index = index; + + page_id_t page_id(index->table->space_id, index->page); + const ulint zip_size = index->table->space->zip_size(); + dberr_t err = DB_SUCCESS; + + if (root_leaf_rw_latch == RW_X_LATCH) { + node_ptr_max_size = btr_node_ptr_max_size(index); + } + + height = ULINT_UNDEFINED; + + for (;;) { + page_t* page; + + ut_ad(n_blocks < BTR_MAX_LEVELS); + tree_savepoints[n_blocks] = mtr_set_savepoint(mtr); + + const rw_lock_type_t rw_latch = height + && latch_mode != BTR_MODIFY_TREE + ? upper_rw_latch : RW_NO_LATCH; + buf_block_t* block = buf_page_get_gen(page_id, zip_size, + rw_latch, NULL, BUF_GET, + file, line, mtr, &err, + height == 0 + && !index->is_clust()); + tree_blocks[n_blocks] = block; + + ut_ad((block != NULL) == (err == DB_SUCCESS)); + + if (err != DB_SUCCESS) { + if (err == DB_DECRYPTION_FAILED) { + ib_push_warning((void *)NULL, + DB_DECRYPTION_FAILED, + "Table %s is encrypted but encryption service or" + " used key_id is not available. " + " Can't continue reading table.", + index->table->name.m_name); + index->table->file_unreadable = true; + } + + break; + } + + page = buf_block_get_frame(block); + + if (height == ULINT_UNDEFINED + && page_is_leaf(page) + && rw_latch != RW_NO_LATCH + && rw_latch != root_leaf_rw_latch) { + /* We should retry to get the page, because the root page + is latched with different level as a leaf page. 
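+ (This happens when the tree consists of the root page alone:
+ the root was fetched with the non-leaf latch mode, so it is
+ released and re-read below using root_leaf_rw_latch.)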
*/ + ut_ad(root_leaf_rw_latch != RW_NO_LATCH); + ut_ad(rw_latch == RW_S_LATCH); + + ut_ad(n_blocks == 0); + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_blocks], + tree_blocks[n_blocks]); + + upper_rw_latch = root_leaf_rw_latch; + continue; + } + + ut_ad(fil_page_index_page_check(page)); + ut_ad(index->id == btr_page_get_index_id(page)); + + if (height == ULINT_UNDEFINED) { + /* We are in the root node */ + + height = btr_page_get_level(page); + } + + if (height == 0) { + if (rw_latch == RW_NO_LATCH + || srv_read_only_mode) { + btr_cur_latch_leaves(block, latch_mode, cursor, + mtr); + } + + /* btr_cur_open_at_index_side_func() and + btr_cur_search_to_nth_level() release + tree s-latch here.*/ + switch (latch_mode) { + case BTR_MODIFY_TREE: + case BTR_CONT_MODIFY_TREE: + case BTR_CONT_SEARCH_TREE: + break; + default: + /* Release the tree s-latch */ + if (!srv_read_only_mode) { + mtr_release_s_latch_at_savepoint( + mtr, savepoint, + dict_index_get_lock(index)); + } + + /* release upper blocks */ + for (; n_releases < n_blocks; n_releases++) { + mtr_release_block_at_savepoint( + mtr, + tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } + } + + page_cur_open_on_rnd_user_rec(block, page_cursor); + + if (height == 0) { + + break; + } + + ut_ad(height > 0); + + height--; + + node_ptr = page_cur_get_rec(page_cursor); + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + 0, ULINT_UNDEFINED, &heap); + + /* If the rec is the first or last in the page for + pessimistic delete intention, it might cause node_ptr insert + for the upper level. We should change the intention and retry. + */ + if (latch_mode == BTR_MODIFY_TREE + && btr_cur_need_opposite_intention( + page, lock_intention, node_ptr)) { + + ut_ad(upper_rw_latch == RW_X_LATCH); + /* release all blocks */ + for (; n_releases <= n_blocks; n_releases++) { + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + + lock_intention = BTR_INTENTION_BOTH; + + page_id.set_page_no(dict_index_get_page(index)); + + height = ULINT_UNDEFINED; + + n_blocks = 0; + n_releases = 0; + + continue; + } + + if (latch_mode == BTR_MODIFY_TREE + && !btr_cur_will_modify_tree( + cursor->index, page, lock_intention, node_ptr, + node_ptr_max_size, zip_size, mtr)) { + ut_ad(upper_rw_latch == RW_X_LATCH); + ut_ad(n_releases <= n_blocks); + + /* we can release upper blocks */ + for (; n_releases < n_blocks; n_releases++) { + if (n_releases == 0) { + /* we should not release root page + to pin to same block. */ + continue; + } + + /* release unused blocks to unpin */ + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } + + if (height == 0 + && latch_mode == BTR_MODIFY_TREE) { + ut_ad(upper_rw_latch == RW_X_LATCH); + /* we should sx-latch root page, if released already. + It contains seg_header. */ + if (n_releases > 0) { + mtr_block_sx_latch_at_savepoint( + mtr, tree_savepoints[0], + tree_blocks[0]); + } + + /* x-latch the branch blocks not released yet. 
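+ (Under BTR_MODIFY_TREE these pages were fetched with
+ RW_NO_LATCH, i.e. only buffer-fixed under the index latch;
+ they must be X-latched before the tree structure is
+ modified.)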
*/ + for (ulint i = n_releases; i <= n_blocks; i++) { + mtr_block_x_latch_at_savepoint( + mtr, tree_savepoints[i], + tree_blocks[i]); + } + } + + /* Go to the child node */ + page_id.set_page_no( + btr_node_ptr_get_child_page_no(node_ptr, offsets)); + + n_blocks++; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return err == DB_SUCCESS; +} + +/*==================== B-TREE INSERT =========================*/ + +/*************************************************************//** +Inserts a record if there is enough space, or if enough space can +be freed by reorganizing. Differs from btr_cur_optimistic_insert because +no heuristics is applied to whether it pays to use CPU time for +reorganizing the page or not. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to inserted record if succeed, else NULL */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +rec_t* +btr_cur_insert_if_possible( +/*=======================*/ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert; + cursor stays valid */ + const dtuple_t* tuple, /*!< in: tuple to insert; the size info need not + have been stored to tuple */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_cur_t* page_cursor; + rec_t* rec; + + ut_ad(dtuple_check_typed(tuple)); + + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + page_cursor = btr_cur_get_page_cur(cursor); + + /* Now, try the insert */ + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, + offsets, heap, n_ext, mtr); + + /* If the record did not fit, reorganize. + For compressed pages, page_cur_tuple_insert() + attempted this already. */ + if (!rec && !page_cur_get_page_zip(page_cursor) + && btr_page_reorganize(page_cursor, cursor->index, mtr)) { + rec = page_cur_tuple_insert( + page_cursor, tuple, cursor->index, + offsets, heap, n_ext, mtr); + } + + ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets)); + return(rec); +} + +/*************************************************************//** +For an insert, checks the locks and does the undo logging if desired. 
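+That is, it checks for conflicting gap locks on the successor
+record (predicate locks on the MBR for a spatial index) and, for
+a clustered index, writes the insert undo log record and stores
+the resulting roll pointer in the DB_ROLL_PTR field of the entry.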
+@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */ +UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6))) +dberr_t +btr_cur_ins_lock_and_undo( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags: if + not zero, the parameters index and thr + should be specified */ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert */ + dtuple_t* entry, /*!< in/out: entry to insert */ + que_thr_t* thr, /*!< in: query thread or NULL */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + bool* inherit)/*!< out: true if the inserted new record maybe + should inherit LOCK_GAP type locks from the + successor record */ +{ + dict_index_t* index; + dberr_t err = DB_SUCCESS; + rec_t* rec; + roll_ptr_t roll_ptr; + + /* Check if we have to wait for a lock: enqueue an explicit lock + request if yes */ + + rec = btr_cur_get_rec(cursor); + index = cursor->index; + + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + ut_ad(mtr->is_named_space(index->table->space)); + + /* Check if there is predicate or GAP lock preventing the insertion */ + if (!(flags & BTR_NO_LOCKING_FLAG)) { + const unsigned type = index->type; + if (UNIV_UNLIKELY(type & DICT_SPATIAL)) { + lock_prdt_t prdt; + rtr_mbr_t mbr; + + rtr_get_mbr_from_tuple(entry, &mbr); + + /* Use on stack MBR variable to test if a lock is + needed. If so, the predicate (MBR) will be allocated + from lock heap in lock_prdt_insert_check_and_lock() */ + lock_init_prdt_from_mbr( + &prdt, &mbr, 0, NULL); + + err = lock_prdt_insert_check_and_lock( + flags, rec, btr_cur_get_block(cursor), + index, thr, mtr, &prdt); + *inherit = false; + } else { +#ifdef WITH_WSREP + trx_t* trx= thr_get_trx(thr); + /* If transaction scanning an unique secondary + key is wsrep high priority thread (brute + force) this scanning may involve GAP-locking + in the index. As this locking happens also + when applying replication events in high + priority applier threads, there is a + probability for lock conflicts between two + wsrep high priority threads. To avoid this + GAP-locking we mark that this transaction + is using unique key scan here. */ + if ((type & (DICT_CLUSTERED | DICT_UNIQUE)) == DICT_UNIQUE + && trx->is_wsrep() + && wsrep_thd_is_BF(trx->mysql_thd, false)) { + trx->wsrep_UK_scan= true; + } +#endif /* WITH_WSREP */ + err = lock_rec_insert_check_and_lock( + flags, rec, btr_cur_get_block(cursor), + index, thr, mtr, inherit); +#ifdef WITH_WSREP + trx->wsrep_UK_scan= false; +#endif /* WITH_WSREP */ + } + } + + if (err != DB_SUCCESS + || !(~flags | (BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG)) + || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) { + + return(err); + } + + if (flags & BTR_NO_UNDO_LOG_FLAG) { + roll_ptr = roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS; + if (!(flags & BTR_KEEP_SYS_FLAG)) { +upd_sys: + dfield_t* r = dtuple_get_nth_field( + entry, index->db_roll_ptr()); + ut_ad(r->len == DATA_ROLL_PTR_LEN); + trx_write_roll_ptr(static_cast<byte*>(r->data), + roll_ptr); + } + } else { + err = trx_undo_report_row_operation(thr, index, entry, + NULL, 0, NULL, NULL, + &roll_ptr); + if (err == DB_SUCCESS) { + goto upd_sys; + } + } + + return(err); +} + +/** +Prefetch siblings of the leaf for the pessimistic operation. 
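+If the optimistic operation fails, the pessimistic path may split
+the page or merge it with a neighbour, which may need the sibling
+pages; reading them in the background hides that I/O.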
+@param block leaf page +@param index index of the page */ +static void btr_cur_prefetch_siblings(const buf_block_t *block, + const dict_index_t *index) +{ + ut_ad(page_is_leaf(block->frame)); + + if (index->is_ibuf()) + return; + + const page_t *page= block->frame; + uint32_t prev= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV)); + uint32_t next= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT)); + + if (prev == FIL_NULL); + else if (index->table->space->acquire()) + buf_read_page_background(index->table->space, + page_id_t(block->page.id().space(), prev), + block->zip_size(), false); + if (next == FIL_NULL); + else if (index->table->space->acquire()) + buf_read_page_background(index->table->space, + page_id_t(block->page.id().space(), next), + block->zip_size(), false); +} + +/*************************************************************//** +Tries to perform an insert to a page in an index tree, next to cursor. +It is assumed that mtr holds an x-latch on the page. The operation does +not succeed if there is too little space on the page. If there is just +one record on the page, the insert will always succeed; this is to +prevent trying to split a page with just one record. +@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */ +dberr_t +btr_cur_optimistic_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameters index and thr should be + specified */ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert; + cursor stays valid */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in/out: query thread; can be NULL if + !(~flags + & (BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG)) */ + mtr_t* mtr) /*!< in/out: mini-transaction; + if this function returns DB_SUCCESS on + a leaf page of a secondary index in a + compressed tablespace, the caller must + mtr_commit(mtr) before latching + any further pages */ +{ + big_rec_t* big_rec_vec = NULL; + dict_index_t* index; + page_cur_t* page_cursor; + buf_block_t* block; + page_t* page; + rec_t* dummy; + bool leaf; + bool reorg __attribute__((unused)); + bool inherit = true; + ulint rec_size; + dberr_t err; + + ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG))); + *big_rec = NULL; + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + index = cursor->index; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + ut_ad(dtuple_check_typed(entry)); + +#ifdef HAVE_valgrind + if (block->page.zip.data) { + MEM_CHECK_DEFINED(page, srv_page_size); + MEM_CHECK_DEFINED(block->page.zip.data, block->zip_size()); + } +#endif /* HAVE_valgrind */ + + leaf = page_is_leaf(page); + + if (UNIV_UNLIKELY(entry->is_alter_metadata())) { + ut_ad(leaf); + goto convert_big_rec; + } + + /* Calculate the record size when entry is converted to a record */ + rec_size = rec_get_converted_size(index, entry, n_ext); + + if (page_zip_rec_needs_ext(rec_size, page_is_comp(page), + dtuple_get_n_fields(entry), + block->zip_size())) { +convert_big_rec: + /* The 
record is so big that we have to store some fields + externally on separate database pages */ + big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext); + + if (UNIV_UNLIKELY(big_rec_vec == NULL)) { + + return(DB_TOO_BIG_RECORD); + } + + rec_size = rec_get_converted_size(index, entry, n_ext); + } + + if (block->page.zip.data && page_zip_is_too_big(index, entry)) { + if (big_rec_vec != NULL) { + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + return(DB_TOO_BIG_RECORD); + } + + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), + goto fail); + + if (block->page.zip.data && leaf + && (page_get_data_size(page) + rec_size + >= dict_index_zip_pad_optimal_page_size(index))) { + /* If compression padding tells us that insertion will + result in too packed up page i.e.: which is likely to + cause compression failure then don't do an optimistic + insertion. */ +fail: + err = DB_FAIL; + + /* prefetch siblings of the leaf for the pessimistic + operation, if the page is leaf. */ + if (page_is_leaf(page)) { + btr_cur_prefetch_siblings(block, index); + } +fail_err: + + if (big_rec_vec) { + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + return(err); + } + + ulint max_size = page_get_max_insert_size_after_reorganize(page, 1); + if (max_size < rec_size) { + goto fail; + } + + const ulint n_recs = page_get_n_recs(page); + if (UNIV_UNLIKELY(n_recs >= 8189)) { + ut_ad(srv_page_size == 65536); + goto fail; + } + + if (page_has_garbage(page)) { + if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + && n_recs > 1 + && page_get_max_insert_size(page, 1) < rec_size) { + + goto fail; + } + } + + /* If there have been many consecutive inserts to the + clustered index leaf page of an uncompressed table, check if + we have to split the page to reserve enough free space for + future updates of records. */ + + if (leaf && !block->page.zip.data && dict_index_is_clust(index) + && page_get_n_recs(page) >= 2 + && dict_index_get_space_reserve() + rec_size > max_size + && (btr_page_get_split_rec_to_right(cursor, &dummy) + || btr_page_get_split_rec_to_left(cursor))) { + goto fail; + } + + page_cursor = btr_cur_get_page_cur(cursor); + + DBUG_LOG("ib_cur", + "insert " << index->name << " (" << index->id << ") by " + << ib::hex(thr ? 
thr->graph->trx->id : 0) + << ' ' << rec_printer(entry).str()); + DBUG_EXECUTE_IF("do_page_reorganize", + btr_page_reorganize(page_cursor, index, mtr);); + + /* Now, try the insert */ + { + const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor); + + /* Check locks and write to the undo log, + if specified */ + err = btr_cur_ins_lock_and_undo(flags, cursor, entry, + thr, mtr, &inherit); + if (err != DB_SUCCESS) { + goto fail_err; + } + +#ifdef UNIV_DEBUG + if (!(flags & BTR_CREATE_FLAG) + && index->is_primary() && page_is_leaf(page)) { + const dfield_t* trx_id = dtuple_get_nth_field( + entry, dict_col_get_clust_pos( + dict_table_get_sys_col(index->table, + DATA_TRX_ID), + index)); + + ut_ad(trx_id->len == DATA_TRX_ID_LEN); + ut_ad(trx_id[1].len == DATA_ROLL_PTR_LEN); + ut_ad(*static_cast<const byte*> + (trx_id[1].data) & 0x80); + if (flags & BTR_NO_UNDO_LOG_FLAG) { + ut_ad(!memcmp(trx_id->data, reset_trx_id, + DATA_TRX_ID_LEN)); + } else { + ut_ad(thr->graph->trx->id); + ut_ad(thr->graph->trx->id + == trx_read_trx_id( + static_cast<const byte*>( + trx_id->data)) + || index->table->is_temporary()); + } + } +#endif + + *rec = page_cur_tuple_insert( + page_cursor, entry, index, offsets, heap, + n_ext, mtr); + + reorg = page_cursor_rec != page_cur_get_rec(page_cursor); + } + + if (*rec) { + } else if (block->page.zip.data) { + ut_ad(!index->table->is_temporary()); + /* Reset the IBUF_BITMAP_FREE bits, because + page_cur_tuple_insert() will have attempted page + reorganize before failing. */ + if (leaf + && !dict_index_is_clust(index)) { + ibuf_reset_free_bits(block); + } + + goto fail; + } else { + ut_ad(!reorg); + + /* If the record did not fit, reorganize */ + if (!btr_page_reorganize(page_cursor, index, mtr)) { + ut_ad(0); + goto fail; + } + + ut_ad(page_get_max_insert_size(page, 1) == max_size); + + reorg = TRUE; + + *rec = page_cur_tuple_insert(page_cursor, entry, index, + offsets, heap, n_ext, mtr); + + if (UNIV_UNLIKELY(!*rec)) { + ib::fatal() << "Cannot insert tuple " << *entry + << "into index " << index->name + << " of table " << index->table->name + << ". Max size: " << max_size; + } + } + +#ifdef BTR_CUR_HASH_ADAPT + if (!leaf) { +# ifdef MYSQL_INDEX_DISABLE_AHI + } else if (index->disable_ahi) { +# endif + } else if (entry->info_bits & REC_INFO_MIN_REC_FLAG) { + ut_ad(entry->is_metadata()); + ut_ad(index->is_instant()); + ut_ad(flags == BTR_NO_LOCKING_FLAG); + } else { + rw_lock_t* ahi_latch = btr_search_sys.get_latch(*index); + if (!reorg && cursor->flag == BTR_CUR_HASH) { + btr_search_update_hash_node_on_insert( + cursor, ahi_latch); + } else { + btr_search_update_hash_on_insert(cursor, ahi_latch); + } + } +#endif /* BTR_CUR_HASH_ADAPT */ + + if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) { + + lock_update_insert(block, *rec); + } + + if (leaf + && !dict_index_is_clust(index) + && !index->table->is_temporary()) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. */ + + /* The free bits in the insert buffer bitmap must + never exceed the free space on a page. It is safe to + decrement or reset the bits in the bitmap in a + mini-transaction that is committed before the + mini-transaction that affects the free space. */ + + /* It is unsafe to increment the bits in a separately + committed mini-transaction, because in crash recovery, + the free bits could momentarily be set too high. */ + + if (block->page.zip.data) { + /* Update the bits in the same mini-transaction. 
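+			For a ROW_FORMAT=COMPRESSED page the free space
+			may also have grown here, because
+			page_cur_tuple_insert() may have recompressed the
+			page, and increasing the bits is only safe in the
+			same mini-transaction as the page change itself.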
*/ + ibuf_update_free_bits_zip(block, mtr); + } else { + /* Decrement the bits in a separate + mini-transaction. */ + ibuf_update_free_bits_if_full( + block, max_size, + rec_size + PAGE_DIR_SLOT_SIZE); + } + } + + *big_rec = big_rec_vec; + + return(DB_SUCCESS); +} + +/*************************************************************//** +Performs an insert on a page of an index tree. It is assumed that mtr +holds an x-latch on the tree and on the cursor page. If the insert is +made on the leaf level, to avoid deadlocks, mtr must also own x-latches +to brothers of page, if those brothers exist. +@return DB_SUCCESS or error number */ +dberr_t +btr_cur_pessimistic_insert( +/*=======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameter thr should be + specified; if no undo logging is specified, + then the caller must have reserved enough + free extents in the file space so that the + insertion will certainly succeed */ + btr_cur_t* cursor, /*!< in: cursor after which to insert; + cursor stays valid */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in/out: query thread; can be NULL if + !(~flags + & (BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG)) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + dict_index_t* index = cursor->index; + big_rec_t* big_rec_vec = NULL; + dberr_t err; + bool inherit = false; + bool success; + uint32_t n_reserved = 0; + + ut_ad(dtuple_check_typed(entry)); + ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG))); + + *big_rec = NULL; + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + + cursor->flag = BTR_CUR_BINARY; + + /* Check locks and write to undo log, if specified */ + + err = btr_cur_ins_lock_and_undo(flags, cursor, entry, + thr, mtr, &inherit); + + if (err != DB_SUCCESS) { + + return(err); + } + + if (!(flags & BTR_NO_UNDO_LOG_FLAG)) { + /* First reserve enough free space for the file segments + of the index tree, so that the insert will not fail because + of lack of space */ + + uint32_t n_extents = uint32_t(cursor->tree_height / 16 + 3); + + success = fsp_reserve_free_extents(&n_reserved, + index->table->space, + n_extents, FSP_NORMAL, mtr); + if (!success) { + return(DB_OUT_OF_FILE_SPACE); + } + } + + if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext), + index->table->not_redundant(), + dtuple_get_n_fields(entry), + btr_cur_get_block(cursor)->zip_size()) + || UNIV_UNLIKELY(entry->is_alter_metadata() + && !dfield_is_ext( + dtuple_get_nth_field( + entry, + index->first_user_field())))) { + /* The record is so big that we have to store some fields + externally on separate database pages */ + + if (UNIV_LIKELY_NULL(big_rec_vec)) { + /* This should never happen, but we handle + the situation in a robust manner. 
*/ + ut_ad(0); + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext); + + if (big_rec_vec == NULL) { + + index->table->space->release_free_extents(n_reserved); + return(DB_TOO_BIG_RECORD); + } + } + + if (dict_index_get_page(index) + == btr_cur_get_block(cursor)->page.id().page_no()) { + + /* The page is the root page */ + *rec = btr_root_raise_and_insert( + flags, cursor, offsets, heap, entry, n_ext, mtr); + } else { + *rec = btr_page_split_and_insert( + flags, cursor, offsets, heap, entry, n_ext, mtr); + } + + if (*rec == NULL && os_has_said_disk_full) { + return(DB_OUT_OF_FILE_SPACE); + } + + ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec + || dict_index_is_spatial(index)); + + if (!(flags & BTR_NO_LOCKING_FLAG)) { + ut_ad(!index->table->is_temporary()); + if (dict_index_is_spatial(index)) { + /* Do nothing */ + } else { + /* The cursor might be moved to the other page + and the max trx id field should be updated after + the cursor was fixed. */ + if (!dict_index_is_clust(index)) { + page_update_max_trx_id( + btr_cur_get_block(cursor), + btr_cur_get_page_zip(cursor), + thr_get_trx(thr)->id, mtr); + } + + if (!page_rec_is_infimum(btr_cur_get_rec(cursor)) + || !page_has_prev(btr_cur_get_page(cursor))) { + /* split and inserted need to call + lock_update_insert() always. */ + inherit = true; + } + } + } + + if (!page_is_leaf(btr_cur_get_page(cursor))) { + ut_ad(!big_rec_vec); + } else { +#ifdef BTR_CUR_HASH_ADAPT +# ifdef MYSQL_INDEX_DISABLE_AHI + if (index->disable_ahi); else +# endif + if (entry->info_bits & REC_INFO_MIN_REC_FLAG) { + ut_ad(entry->is_metadata()); + ut_ad(index->is_instant()); + ut_ad(flags & BTR_NO_LOCKING_FLAG); + ut_ad(!(flags & BTR_CREATE_FLAG)); + } else { + btr_search_update_hash_on_insert( + cursor, btr_search_sys.get_latch(*index)); + } +#endif /* BTR_CUR_HASH_ADAPT */ + if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) { + + lock_update_insert(btr_cur_get_block(cursor), *rec); + } + } + + index->table->space->release_free_extents(n_reserved); + *big_rec = big_rec_vec; + + return(DB_SUCCESS); +} + +/*==================== B-TREE UPDATE =========================*/ + +/*************************************************************//** +For an update, checks the locks and does the undo logging. 
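+Undo log records are written only for clustered index records; for a
+secondary index only the lock check is performed, because secondary
+index records carry no undo information of their own.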
+@return DB_SUCCESS, DB_WAIT_LOCK, or error number */ +UNIV_INLINE MY_ATTRIBUTE((warn_unused_result)) +dberr_t +btr_cur_upd_lock_and_undo( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on record to update */ + const rec_offs* offsets,/*!< in: rec_get_offsets() on cursor */ + const upd_t* update, /*!< in: update vector */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread + (can be NULL if BTR_NO_LOCKING_FLAG) */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + roll_ptr_t* roll_ptr)/*!< out: roll pointer */ +{ + dict_index_t* index; + const rec_t* rec; + dberr_t err; + + ut_ad((thr != NULL) || (flags & BTR_NO_LOCKING_FLAG)); + + rec = btr_cur_get_rec(cursor); + index = cursor->index; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr->is_named_space(index->table->space)); + + if (!dict_index_is_clust(index)) { + ut_ad(dict_index_is_online_ddl(index) + == !!(flags & BTR_CREATE_FLAG)); + + /* We do undo logging only when we update a clustered index + record */ + return(lock_sec_rec_modify_check_and_lock( + flags, btr_cur_get_block(cursor), rec, + index, thr, mtr)); + } + + /* Check if we have to wait for a lock: enqueue an explicit lock + request if yes */ + + if (!(flags & BTR_NO_LOCKING_FLAG)) { + err = lock_clust_rec_modify_check_and_lock( + flags, btr_cur_get_block(cursor), rec, index, + offsets, thr); + if (err != DB_SUCCESS) { + return(err); + } + } + + /* Append the info about the update in the undo log */ + + return((flags & BTR_NO_UNDO_LOG_FLAG) + ? DB_SUCCESS + : trx_undo_report_row_operation( + thr, index, NULL, update, + cmpl_info, rec, offsets, roll_ptr)); +} + +/** Write DB_TRX_ID,DB_ROLL_PTR to a clustered index entry. +@param[in,out] entry clustered index entry +@param[in] index clustered index +@param[in] trx_id DB_TRX_ID +@param[in] roll_ptr DB_ROLL_PTR */ +static void btr_cur_write_sys( + dtuple_t* entry, + const dict_index_t* index, + trx_id_t trx_id, + roll_ptr_t roll_ptr) +{ + dfield_t* t = dtuple_get_nth_field(entry, index->db_trx_id()); + ut_ad(t->len == DATA_TRX_ID_LEN); + trx_write_trx_id(static_cast<byte*>(t->data), trx_id); + dfield_t* r = dtuple_get_nth_field(entry, index->db_roll_ptr()); + ut_ad(r->len == DATA_ROLL_PTR_LEN); + trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr); +} + +/** Update DB_TRX_ID, DB_ROLL_PTR in a clustered index record. +@param[in,out] block clustered index leaf page +@param[in,out] rec clustered index record +@param[in] index clustered index +@param[in] offsets rec_get_offsets(rec, index) +@param[in] trx transaction +@param[in] roll_ptr DB_ROLL_PTR value +@param[in,out] mtr mini-transaction */ +static void btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec, + dict_index_t *index, const rec_offs *offsets, + const trx_t *trx, roll_ptr_t roll_ptr, + mtr_t *mtr) +{ + ut_ad(index->is_primary()); + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (UNIV_LIKELY_NULL(block->page.zip.data)) + { + page_zip_write_trx_id_and_roll_ptr(block, rec, offsets, index->db_trx_id(), + trx->id, roll_ptr, mtr); + return; + } + + ulint offset= index->trx_id_offset; + + if (!offset) + offset= row_get_trx_id_offset(index, offsets); + + compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR); + + /* During IMPORT the trx id in the record can be in the future, if + the .ibd file is being imported from another instance. During IMPORT + roll_ptr will be 0. 
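+  That is why the sanity check in the debug assertion below is skipped
+  whenever roll_ptr == 0.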
*/ + ut_ad(roll_ptr == 0 || + lock_check_trx_id_sanity(trx_read_trx_id(rec + offset), + rec, index, offsets)); + + byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; + + trx_write_trx_id(sys, trx->id); + trx_write_roll_ptr(sys + DATA_TRX_ID_LEN, roll_ptr); + + ulint d= 0; + const byte *src= nullptr; + byte *dest= rec + offset; + ulint len= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + if (UNIV_LIKELY(index->trx_id_offset)) + { + const rec_t *prev= page_rec_get_prev_const(rec); + if (UNIV_UNLIKELY(prev == rec)) + ut_ad(0); + else if (page_rec_is_infimum(prev)); + else + for (src= prev + offset; d < DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; d++) + if (src[d] != sys[d]) + break; + if (d > 6 && memcmp(dest, sys, d)) + { + /* We save space by replacing a single record + + WRITE,page_offset(dest),byte[13] + + with two records: + + MEMMOVE,page_offset(dest),d(1 byte),offset(1..3 bytes), + WRITE|0x80,0,byte[13-d] + + The single WRITE record would be x+13 bytes long, with x>2. + The MEMMOVE record would be up to x+1+3 = x+4 bytes, and the + second WRITE would be 1+1+13-d = 15-d bytes. + + The total size is: x+13 versus x+4+15-d = x+19-d bytes. + To save space, we must have d>6, that is, the complete DB_TRX_ID and + the first byte(s) of DB_ROLL_PTR must match the previous record. */ + memcpy(dest, src, d); + mtr->memmove(*block, page_offset(dest), page_offset(src), d); + dest+= d; + len-= d; + /* DB_TRX_ID,DB_ROLL_PTR must be unique in each record when + DB_TRX_ID refers to an active transaction. */ + ut_ad(len); + } + else + d= 0; + } + + if (UNIV_LIKELY(len)) /* extra safety, to avoid corrupting the log */ + mtr->memcpy<mtr_t::MAYBE_NOP>(*block, dest, sys + d, len); +} + +/*************************************************************//** +See if there is enough place in the page modification log to log +an update-in-place. + +@retval false if out of space; IBUF_BITMAP_FREE will be reset +outside mtr if the page was recompressed +@retval true if enough place; + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is +a secondary index leaf page. This has to be done either within the +same mini-transaction, or by invoking ibuf_reset_free_bits() before +mtr_commit(mtr). */ +bool +btr_cur_update_alloc_zip_func( +/*==========================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + page_cur_t* cursor, /*!< in/out: B-tree page cursor */ + dict_index_t* index, /*!< in: the index corresponding to cursor */ +#ifdef UNIV_DEBUG + rec_offs* offsets,/*!< in/out: offsets of the cursor record */ +#endif /* UNIV_DEBUG */ + ulint length, /*!< in: size needed */ + bool create, /*!< in: true=delete-and-insert, + false=update-in-place */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + + /* Have a local copy of the variables as these can change + dynamically. */ + const page_t* page = page_cur_get_page(cursor); + + ut_ad(page_zip == page_cur_get_page_zip(cursor)); + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets)); + + if (page_zip_available(page_zip, dict_index_is_clust(index), + length, create)) { + return(true); + } + + if (!page_zip->m_nonempty && !page_has_garbage(page)) { + /* The page has been freshly compressed, so + reorganizing it will not help. 
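+		Reorganization could only reclaim space from the
+		modification log or from deleted records, and the checks
+		above show that both are empty.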
*/ + return(false); + } + + if (create && page_is_leaf(page) + && (length + page_get_data_size(page) + >= dict_index_zip_pad_optimal_page_size(index))) { + return(false); + } + + if (!btr_page_reorganize(cursor, index, mtr)) { + goto out_of_space; + } + + rec_offs_make_valid(page_cur_get_rec(cursor), index, + page_is_leaf(page), offsets); + + /* After recompressing a page, we must make sure that the free + bits in the insert buffer bitmap will not exceed the free + space on the page. Because this function will not attempt + recompression unless page_zip_available() fails above, it is + safe to reset the free bits if page_zip_available() fails + again, below. The free bits can safely be reset in a separate + mini-transaction. If page_zip_available() succeeds below, we + can be sure that the btr_page_reorganize() above did not reduce + the free space available on the page. */ + + if (page_zip_available(page_zip, dict_index_is_clust(index), + length, create)) { + return(true); + } + +out_of_space: + ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets)); + + /* Out of space: reset the free bits. */ + if (!dict_index_is_clust(index) + && !index->table->is_temporary() + && page_is_leaf(page)) { + ibuf_reset_free_bits(page_cur_get_block(cursor)); + } + + return(false); +} + +/** Apply an update vector to a record. No field size changes are allowed. + +This is usually invoked on a clustered index. The only use case for a +secondary index is row_ins_sec_index_entry_by_modify() or its +counterpart in ibuf_insert_to_index_page(). +@param[in,out] rec index record +@param[in] index the index of the record +@param[in] offsets rec_get_offsets(rec, index) +@param[in] update update vector +@param[in,out] block index page +@param[in,out] mtr mini-transaction */ +void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index, + const rec_offs *offsets, const upd_t *update, + buf_block_t *block, mtr_t *mtr) +{ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!index->table->skip_alter_undo); + ut_ad(!block->page.zip.data || index->table->not_redundant()); + +#ifdef UNIV_DEBUG + if (rec_offs_comp(offsets)) { + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + break; + case REC_STATUS_INSTANT: + ut_ad(index->is_instant()); + break; + case REC_STATUS_NODE_PTR: + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + ut_ad("wrong record status in update" == 0); + } + } +#endif /* UNIV_DEBUG */ + + static_assert(REC_INFO_BITS_SHIFT == 0, "compatibility"); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + ut_ad(rec_offs_comp(offsets)); + byte* info_bits = &rec[-REC_NEW_INFO_BITS]; + const bool flip_del_mark = (*info_bits ^ update->info_bits) + & REC_INFO_DELETED_FLAG; + *info_bits &= byte(~REC_INFO_BITS_MASK); + *info_bits |= update->info_bits; + + if (flip_del_mark) { + page_zip_rec_set_deleted(block, rec, update->info_bits + & REC_INFO_DELETED_FLAG, mtr); + } + } else { + byte* info_bits = &rec[rec_offs_comp(offsets) + ? 
-REC_NEW_INFO_BITS + : -REC_OLD_INFO_BITS]; + + mtr->write<1,mtr_t::MAYBE_NOP>(*block, info_bits, + (*info_bits + & ~REC_INFO_BITS_MASK) + | update->info_bits); + } + + for (ulint i = 0; i < update->n_fields; i++) { + const upd_field_t* uf = upd_get_nth_field(update, i); + if (upd_fld_is_virtual_col(uf) && !index->has_virtual()) { + continue; + } + const ulint n = uf->field_no; + + ut_ad(!dfield_is_ext(&uf->new_val) + == !rec_offs_nth_extern(offsets, n)); + ut_ad(!rec_offs_nth_default(offsets, n)); + + if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) { + if (rec_offs_nth_sql_null(offsets, n)) { + ut_ad(index->table->is_instant()); + ut_ad(n >= index->n_core_fields); + continue; + } + + ut_ad(!index->table->not_redundant()); + switch (ulint size = rec_get_nth_field_size(rec, n)) { + case 0: + break; + case 1: + mtr->write<1,mtr_t::MAYBE_NOP>( + *block, + rec_get_field_start_offs(rec, n) + rec, + 0U); + break; + default: + mtr->memset( + block, + page_offset(rec_get_field_start_offs( + rec, n) + rec), + size, 0); + } + ulint l = rec_get_1byte_offs_flag(rec) + ? (n + 1) : (n + 1) * 2; + byte* b = rec - REC_N_OLD_EXTRA_BYTES - l; + compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8 + == REC_2BYTE_SQL_NULL_MASK); + mtr->write<1>(*block, b, + byte(*b | REC_1BYTE_SQL_NULL_MASK)); + continue; + } + + ulint len; + byte* data = rec_get_nth_field(rec, offsets, n, &len); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + ut_ad(len == uf->new_val.len); + memcpy(data, uf->new_val.data, len); + continue; + } + + if (UNIV_UNLIKELY(len != uf->new_val.len)) { + ut_ad(len == UNIV_SQL_NULL); + ut_ad(!rec_offs_comp(offsets)); + len = uf->new_val.len; + ut_ad(len == rec_get_nth_field_size(rec, n)); + ulint l = rec_get_1byte_offs_flag(rec) + ? (n + 1) : (n + 1) * 2; + byte* b = rec - REC_N_OLD_EXTRA_BYTES - l; + compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8 + == REC_2BYTE_SQL_NULL_MASK); + mtr->write<1>(*block, b, + byte(*b & ~REC_1BYTE_SQL_NULL_MASK)); + } + + if (len) { + mtr->memcpy<mtr_t::MAYBE_NOP>(*block, data, + uf->new_val.data, len); + } + } + + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + page_zip_write_rec(block, rec, index, offsets, 0, mtr); + } +} + +/*************************************************************//** +Updates a record when the update causes no size changes in its fields. +We assume here that the ordering fields of the record do not change. 
+@return locking or undo log related error code, or +@retval DB_SUCCESS on success +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */ +dberr_t +btr_cur_update_in_place( +/*====================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + rec_offs* offsets,/*!< in/out: offsets on cursor->page_cur.rec */ + const upd_t* update, /*!< in: update vector */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ +{ + dict_index_t* index; + dberr_t err; + rec_t* rec; + roll_ptr_t roll_ptr = 0; + ulint was_delete_marked; + + ut_ad(page_is_leaf(cursor->page_cur.block->frame)); + rec = btr_cur_get_rec(cursor); + index = cursor->index; + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG) + || index->table->is_temporary()); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->id == trx_id + || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor))); + ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id); + ut_ad(!(update->info_bits & REC_INFO_MIN_REC_FLAG)); + + DBUG_LOG("ib_cur", + "update-in-place " << index->name << " (" << index->id + << ") by " << ib::hex(trx_id) << ": " + << rec_printer(rec, offsets).str()); + + buf_block_t* block = btr_cur_get_block(cursor); + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + + /* Check that enough space is available on the compressed page. */ + if (UNIV_LIKELY_NULL(page_zip)) { + ut_ad(!index->table->is_temporary()); + + if (!btr_cur_update_alloc_zip( + page_zip, btr_cur_get_page_cur(cursor), + index, offsets, rec_offs_size(offsets), + false, mtr)) { + return(DB_ZIP_OVERFLOW); + } + + rec = btr_cur_get_rec(cursor); + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, offsets, + update, cmpl_info, + thr, mtr, &roll_ptr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + goto func_exit; + } + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + btr_cur_upd_rec_sys(block, rec, index, offsets, + thr_get_trx(thr), roll_ptr, mtr); + } + + was_delete_marked = rec_get_deleted_flag( + rec, page_is_comp(buf_block_get_frame(block))); + /* In delete-marked records, DB_TRX_ID must always refer to an + existing undo log record. */ + ut_ad(!was_delete_marked + || !dict_index_is_clust(index) + || row_get_rec_trx_id(rec, index, offsets)); + +#ifdef BTR_CUR_HASH_ADAPT + { + rw_lock_t* ahi_latch = block->index + ? btr_search_sys.get_latch(*index) : NULL; + if (ahi_latch) { + /* TO DO: Can we skip this if none of the fields + index->search_info->curr_n_fields + are being updated? 
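+			(curr_n_fields describes the prefix that the
+			adaptive hash index currently hashes; if the update
+			left that prefix unchanged, the hash entry would
+			still be valid.)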
*/ + + /* The function row_upd_changes_ord_field_binary + does not work on a secondary index. */ + + if (!dict_index_is_clust(index) + || row_upd_changes_ord_field_binary( + index, update, thr, NULL, NULL)) { + ut_ad(!(update->info_bits + & REC_INFO_MIN_REC_FLAG)); + /* Remove possible hash index pointer + to this record */ + btr_search_update_hash_on_delete(cursor); + } + + rw_lock_x_lock(ahi_latch); + } + + assert_block_ahi_valid(block); +#endif /* BTR_CUR_HASH_ADAPT */ + + btr_cur_upd_rec_in_place(rec, index, offsets, update, block, + mtr); + +#ifdef BTR_CUR_HASH_ADAPT + if (ahi_latch) { + rw_lock_x_unlock(ahi_latch); + } + } +#endif /* BTR_CUR_HASH_ADAPT */ + + if (was_delete_marked + && !rec_get_deleted_flag( + rec, page_is_comp(buf_block_get_frame(block)))) { + /* The new updated record owns its possible externally + stored fields */ + + btr_cur_unmark_extern_fields(block, rec, index, offsets, mtr); + } + + ut_ad(err == DB_SUCCESS); + +func_exit: + if (page_zip + && !(flags & BTR_KEEP_IBUF_BITMAP) + && !dict_index_is_clust(index) + && page_is_leaf(buf_block_get_frame(block))) { + /* Update the free bits in the insert buffer. */ + ut_ad(!index->table->is_temporary()); + ibuf_update_free_bits_zip(block, mtr); + } + + return(err); +} + +/** Trim a metadata record during the rollback of instant ALTER TABLE. +@param[in] entry metadata tuple +@param[in] index primary key +@param[in] update update vector for the rollback */ +ATTRIBUTE_COLD +static void btr_cur_trim_alter_metadata(dtuple_t* entry, + const dict_index_t* index, + const upd_t* update) +{ + ut_ad(index->is_instant()); + ut_ad(update->is_alter_metadata()); + ut_ad(entry->is_alter_metadata()); + + ut_ad(update->fields[0].field_no == index->first_user_field()); + ut_ad(update->fields[0].new_val.ext); + ut_ad(update->fields[0].new_val.len == FIELD_REF_SIZE); + ut_ad(entry->n_fields - 1 == index->n_fields); + + const byte* ptr = static_cast<const byte*>( + update->fields[0].new_val.data); + ut_ad(!mach_read_from_4(ptr + BTR_EXTERN_LEN)); + ut_ad(mach_read_from_4(ptr + BTR_EXTERN_LEN + 4) > 4); + ut_ad(mach_read_from_4(ptr + BTR_EXTERN_OFFSET) == FIL_PAGE_DATA); + ut_ad(mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID) + == index->table->space->id); + + ulint n_fields = update->fields[1].field_no; + ut_ad(n_fields <= index->n_fields); + if (n_fields != index->n_uniq) { + ut_ad(n_fields + >= index->n_core_fields); + entry->n_fields = n_fields; + return; + } + + /* This is based on dict_table_t::deserialise_columns() + and btr_cur_instant_init_low(). */ + mtr_t mtr; + mtr.start(); + buf_block_t* block = buf_page_get( + page_id_t(index->table->space->id, + mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)), + 0, RW_S_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE); + ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_TYPE_BLOB); + ut_ad(mach_read_from_4(&block->frame[FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO]) + == FIL_NULL); + ut_ad(mach_read_from_4(&block->frame[FIL_PAGE_DATA + + BTR_BLOB_HDR_PART_LEN]) + == mach_read_from_4(ptr + BTR_EXTERN_LEN + 4)); + n_fields = mach_read_from_4( + &block->frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE]) + + index->first_user_field(); + /* Rollback should not increase the number of fields. */ + ut_ad(n_fields <= index->n_fields); + ut_ad(n_fields + 1 <= entry->n_fields); + /* dict_index_t::clear_instant_alter() cannot be invoked while + rollback of an instant ALTER TABLE transaction is in progress + for an is_alter_metadata() record. 
*/ + ut_ad(n_fields >= index->n_core_fields); + + mtr.commit(); + entry->n_fields = n_fields + 1; +} + +/** Trim an update tuple due to instant ADD COLUMN, if needed. +For normal records, the trailing instantly added fields that match +the initial default values are omitted. + +For the special metadata record on a table on which instant +ADD COLUMN has already been executed, both ADD COLUMN and the +rollback of ADD COLUMN need to be handled specially. + +@param[in,out] entry index entry +@param[in] index index +@param[in] update update vector +@param[in] thr execution thread */ +static inline +void +btr_cur_trim( + dtuple_t* entry, + const dict_index_t* index, + const upd_t* update, + const que_thr_t* thr) +{ + if (!index->is_instant()) { + } else if (UNIV_UNLIKELY(update->is_metadata())) { + /* We are either updating a metadata record + (instant ALTER TABLE on a table where instant ALTER was + already executed) or rolling back such an operation. */ + ut_ad(!upd_get_nth_field(update, 0)->orig_len); + ut_ad(entry->is_metadata()); + + if (thr->graph->trx->in_rollback) { + /* This rollback can occur either as part of + ha_innobase::commit_inplace_alter_table() rolling + back after a failed innobase_add_instant_try(), + or as part of crash recovery. Either way, the + table will be in the data dictionary cache, with + the instantly added columns going to be removed + later in the rollback. */ + ut_ad(index->table->cached); + /* The DB_TRX_ID,DB_ROLL_PTR are always last, + and there should be some change to roll back. + The first field in the update vector is the + first instantly added column logged by + innobase_add_instant_try(). */ + ut_ad(update->n_fields > 2); + if (update->is_alter_metadata()) { + btr_cur_trim_alter_metadata( + entry, index, update); + return; + } + ut_ad(!entry->is_alter_metadata()); + + ulint n_fields = upd_get_nth_field(update, 0) + ->field_no; + ut_ad(n_fields + 1 >= entry->n_fields); + entry->n_fields = n_fields; + } + } else { + entry->trim(*index); + } +} + +/*************************************************************//** +Tries to update a record on a page in an index tree. It is assumed that mtr +holds an x-latch on the page. The operation does not succeed if there is too +little space on the page or if the update would result in too empty a page, +so that tree compression is recommended. We assume here that the ordering +fields of the record do not change. 
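+If the update changes no field sizes and involves no externally stored
+columns, the work is delegated to btr_cur_update_in_place().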
+@return error code, including +@retval DB_SUCCESS on success +@retval DB_OVERFLOW if the updated record does not fit +@retval DB_UNDERFLOW if the page would become too empty +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */ +dberr_t +btr_cur_optimistic_update( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */ + const upd_t* update, /*!< in: update vector; this must also + contain trx id and roll ptr fields */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ +{ + dict_index_t* index; + page_cur_t* page_cursor; + dberr_t err; + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + rec_t* rec; + ulint max_size; + ulint new_rec_size; + ulint old_rec_size; + ulint max_ins_size = 0; + dtuple_t* new_entry; + roll_ptr_t roll_ptr; + ulint i; + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + rec = btr_cur_get_rec(cursor); + index = cursor->index; + ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG) + || index->table->is_temporary()); + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + /* This is intended only for leaf page updates */ + ut_ad(page_is_leaf(page)); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->id == trx_id + || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + ut_ad(fil_page_index_page_check(page)); + ut_ad(btr_page_get_index_id(page) == index->id); + + *offsets = rec_get_offsets(rec, index, *offsets, index->n_core_fields, + ULINT_UNDEFINED, heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(rec, *offsets) + || thr_get_trx(thr) == trx_roll_crash_recv_trx); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + if (UNIV_LIKELY(!update->is_metadata()) + && !row_upd_changes_field_size_or_external(index, *offsets, + update)) { + + /* The simplest and the most common case: the update does not + change the size of any field and none of the updated fields is + externally stored in rec or update, and there is enough space + on the compressed page to log the update. */ + + return(btr_cur_update_in_place( + flags, cursor, *offsets, update, + cmpl_info, thr, trx_id, mtr)); + } + + if (rec_offs_any_extern(*offsets)) { +any_extern: + ut_ad(!index->is_ibuf()); + /* Externally stored fields are treated in pessimistic + update */ + + /* prefetch siblings of the leaf for the pessimistic + operation. 
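+		The caller is expected to retry with
+		btr_cur_pessimistic_update(), which may modify the
+		neighbouring pages when it splits or merges the page.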
*/ + btr_cur_prefetch_siblings(block, index); + + return(DB_OVERFLOW); + } + + if (rec_is_metadata(rec, *index) && index->table->instant) { + goto any_extern; + } + + for (i = 0; i < upd_get_n_fields(update); i++) { + if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) { + + goto any_extern; + } + } + + DBUG_LOG("ib_cur", + "update " << index->name << " (" << index->id << ") by " + << ib::hex(trx_id) << ": " + << rec_printer(rec, *offsets).str()); + + page_cursor = btr_cur_get_page_cur(cursor); + + if (!*heap) { + *heap = mem_heap_create( + rec_offs_size(*offsets) + + DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets))); + } + + new_entry = row_rec_to_index_entry(rec, index, *offsets, *heap); + ut_ad(!dtuple_get_n_ext(new_entry)); + + /* The page containing the clustered index record + corresponding to new_entry is latched in mtr. + Thus the following call is safe. */ + row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, + *heap); + btr_cur_trim(new_entry, index, update, thr); + old_rec_size = rec_offs_size(*offsets); + new_rec_size = rec_get_converted_size(index, new_entry, 0); + + page_zip = buf_block_get_page_zip(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_zip) { + ut_ad(!index->table->is_temporary()); + + if (page_zip_rec_needs_ext(new_rec_size, page_is_comp(page), + dict_index_get_n_fields(index), + block->zip_size())) { + goto any_extern; + } + + if (!btr_cur_update_alloc_zip( + page_zip, page_cursor, index, *offsets, + new_rec_size, true, mtr)) { + return(DB_ZIP_OVERFLOW); + } + + rec = page_cur_get_rec(page_cursor); + } + + /* We limit max record size to 16k even for 64k page size. */ + if (new_rec_size >= COMPRESSED_REC_MAX_DATA_SIZE || + (!dict_table_is_comp(index->table) + && new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) { + err = DB_OVERFLOW; + + goto func_exit; + } + + if (UNIV_UNLIKELY(new_rec_size + >= (page_get_free_space_of_empty(page_is_comp(page)) + / 2))) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + err = DB_OVERFLOW; + goto func_exit; + } + + if (UNIV_UNLIKELY(page_get_data_size(page) + - old_rec_size + new_rec_size + < BTR_CUR_PAGE_COMPRESS_LIMIT(index))) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + + /* The page would become too empty */ + err = DB_UNDERFLOW; + goto func_exit; + } + + /* We do not attempt to reorganize if the page is compressed. + This is because the page may fail to compress after reorganization. */ + max_size = page_zip + ? page_get_max_insert_size(page, 1) + : (old_rec_size + + page_get_max_insert_size_after_reorganize(page, 1)); + + if (!page_zip) { + max_ins_size = page_get_max_insert_size_after_reorganize( + page, 1); + } + + if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT) + && (max_size >= new_rec_size)) + || (page_get_n_recs(page) <= 1))) { + + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). 
*/ + + /* There was not enough space, or it did not pay to + reorganize: for simplicity, we decide what to do assuming a + reorganization is needed, though it might not be necessary */ + + err = DB_OVERFLOW; + goto func_exit; + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets, + update, cmpl_info, + thr, mtr, &roll_ptr); + if (err != DB_SUCCESS) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + goto func_exit; + } + + /* Ok, we may do the replacement. Store on the page infimum the + explicit locks on rec, before deleting rec (see the comment in + btr_cur_pessimistic_update). */ + if (!dict_table_is_locking_disabled(index->table)) { + lock_rec_store_on_page_infimum(block, rec); + } + + if (UNIV_UNLIKELY(update->is_metadata())) { + ut_ad(new_entry->is_metadata()); + ut_ad(index->is_instant()); + /* This can be innobase_add_instant_try() performing a + subsequent instant ADD COLUMN, or its rollback by + row_undo_mod_clust_low(). */ + ut_ad(flags & BTR_NO_LOCKING_FLAG); + } else { + btr_search_update_hash_on_delete(cursor); + } + + page_cur_delete_rec(page_cursor, index, *offsets, mtr); + + page_cur_move_to_prev(page_cursor); + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + btr_cur_write_sys(new_entry, index, trx_id, roll_ptr); + } + + /* There are no externally stored columns in new_entry */ + rec = btr_cur_insert_if_possible( + cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr); + ut_a(rec); /* <- We calculated above the insert would fit */ + + if (UNIV_UNLIKELY(update->is_metadata())) { + /* We must empty the PAGE_FREE list, because if this + was a rollback, the shortened metadata record + would have too many fields, and we would be unable to + know the size of the freed record. */ + btr_page_reorganize(page_cursor, index, mtr); + } else if (!dict_table_is_locking_disabled(index->table)) { + /* Restore the old explicit lock state on the record */ + lock_rec_restore_from_page_infimum(block, rec, block); + } + + page_cur_move_to_next(page_cursor); + ut_ad(err == DB_SUCCESS); + +func_exit: + if (!(flags & BTR_KEEP_IBUF_BITMAP) + && !dict_index_is_clust(index)) { + /* Update the free bits in the insert buffer. */ + if (page_zip) { + ut_ad(!index->table->is_temporary()); + ibuf_update_free_bits_zip(block, mtr); + } else if (!index->table->is_temporary()) { + ibuf_update_free_bits_low(block, max_ins_size, mtr); + } + } + + if (err != DB_SUCCESS) { + /* prefetch siblings of the leaf for the pessimistic + operation. */ + btr_cur_prefetch_siblings(block, index); + } + + return(err); +} + +/*************************************************************//** +If, in a split, a new supremum record was created as the predecessor of the +updated record, the supremum record must inherit exactly the locks on the +updated record. In the split it may have inherited locks from the successor +of the updated record, which is not correct. This function restores the +right locks for the new supremum. 
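+For example, if the split made the updated record the first user record
+of its page, the supremum of the preceding page must inherit exactly the
+locks of the updated record, not those of its former successor.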
*/ +static +void +btr_cur_pess_upd_restore_supremum( +/*==============================*/ + buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: updated record */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* page; + buf_block_t* prev_block; + + page = buf_block_get_frame(block); + + if (page_rec_get_next(page_get_infimum_rec(page)) != rec) { + /* Updated record is not the first user record on its page */ + + return; + } + + const uint32_t prev_page_no = btr_page_get_prev(page); + + const page_id_t page_id(block->page.id().space(), prev_page_no); + + ut_ad(prev_page_no != FIL_NULL); + prev_block = buf_page_get_with_no_latch(page_id, block->zip_size(), + mtr); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_next(prev_block->frame) + == block->page.id().page_no()); +#endif /* UNIV_BTR_DEBUG */ + + /* We must already have an x-latch on prev_block! */ + ut_ad(mtr->memo_contains_flagged(prev_block, MTR_MEMO_PAGE_X_FIX)); + + lock_rec_reset_and_inherit_gap_locks(prev_block, block, + PAGE_HEAP_NO_SUPREMUM, + page_rec_get_heap_no(rec)); +} + +/*************************************************************//** +Performs an update of a record on a page of a tree. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. If the +update is made on the leaf level, to avoid deadlocks, mtr must also +own x-latches to brothers of page, if those brothers exist. We assume +here that the ordering fields of the record do not change. +@return DB_SUCCESS or error code */ +dberr_t +btr_cur_pessimistic_update( +/*=======================*/ + ulint flags, /*!< in: undo logging, locking, and rollback + flags */ + btr_cur_t* cursor, /*!< in/out: cursor on the record to update; + cursor may become invalid if *big_rec == NULL + || !(flags & BTR_KEEP_POS_FLAG) */ + rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** offsets_heap, + /*!< in/out: pointer to memory heap + that can be emptied */ + mem_heap_t* entry_heap, + /*!< in/out: memory heap for allocating + big_rec and the index tuple */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + upd_t* update, /*!< in/out: update vector; this is allowed to + also contain trx id and roll ptr fields. + Non-updated columns that are moved offpage will + be appended to this. */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; must be + committed before latching any further pages */ +{ + big_rec_t* big_rec_vec = NULL; + big_rec_t* dummy_big_rec; + dict_index_t* index; + buf_block_t* block; + page_zip_des_t* page_zip; + rec_t* rec; + page_cur_t* page_cursor; + dberr_t err; + dberr_t optim_err; + roll_ptr_t roll_ptr; + bool was_first; + uint32_t n_reserved = 0; + + *offsets = NULL; + *big_rec = NULL; + + block = btr_cur_get_block(cursor); + page_zip = buf_block_get_page_zip(block); + index = cursor->index; + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK | + MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + ut_ad(!page_zip || !index->table->is_temporary()); + /* The insert buffer tree should never be updated in place. 
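+	Change buffer records are only ever inserted and deleted,
+	never modified, so this code path must not see the ibuf tree.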
*/ + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG) + || index->table->is_temporary()); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->id == trx_id + || (flags & ulint(~BTR_KEEP_POS_FLAG)) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + + err = optim_err = btr_cur_optimistic_update( + flags | BTR_KEEP_IBUF_BITMAP, + cursor, offsets, offsets_heap, update, + cmpl_info, thr, trx_id, mtr); + + switch (err) { + case DB_ZIP_OVERFLOW: + case DB_UNDERFLOW: + case DB_OVERFLOW: + break; + default: + err_exit: + /* We suppressed this with BTR_KEEP_IBUF_BITMAP. + For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were + already reset by btr_cur_update_alloc_zip() if the + page was recompressed. */ + if (page_zip + && optim_err != DB_ZIP_OVERFLOW + && !dict_index_is_clust(index) + && page_is_leaf(block->frame)) { + ut_ad(!index->table->is_temporary()); + ibuf_update_free_bits_zip(block, mtr); + } + + if (big_rec_vec != NULL) { + dtuple_big_rec_free(big_rec_vec); + } + + return(err); + } + + rec = btr_cur_get_rec(cursor); + ut_ad(rec_offs_validate(rec, index, *offsets)); + + dtuple_t* new_entry; + + const bool is_metadata = rec_is_metadata(rec, *index); + + if (UNIV_UNLIKELY(is_metadata)) { + ut_ad(update->is_metadata()); + ut_ad(flags & BTR_NO_LOCKING_FLAG); + ut_ad(index->is_instant()); + new_entry = row_metadata_to_tuple( + rec, index, *offsets, entry_heap, + update->info_bits, !thr_get_trx(thr)->in_rollback); + ut_ad(new_entry->n_fields + == ulint(index->n_fields) + + update->is_alter_metadata()); + } else { + new_entry = row_rec_to_index_entry(rec, index, *offsets, + entry_heap); + } + + /* The page containing the clustered index record + corresponding to new_entry is latched in mtr. If the + clustered index record is delete-marked, then its externally + stored fields cannot have been purged yet, because then the + purge would also have removed the clustered index record + itself. Thus the following call is safe. */ + row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, + entry_heap); + btr_cur_trim(new_entry, index, update, thr); + + /* We have to set appropriate extern storage bits in the new + record to be inserted: we have to remember which fields were such */ + + ut_ad(!page_is_comp(block->frame) || !rec_get_node_ptr_flag(rec)); + ut_ad(rec_offs_validate(rec, index, *offsets)); + + if ((flags & BTR_NO_UNDO_LOG_FLAG) + && rec_offs_any_extern(*offsets)) { + /* We are in a transaction rollback undoing a row + update: we must free possible externally stored fields + which got new values in the update, if they are not + inherited values. They can be inherited if we have + updated the primary key to another value, and then + update it back again. */ + + ut_ad(big_rec_vec == NULL); + ut_ad(dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->in_rollback); + + DEBUG_SYNC_C("blob_rollback_middle"); + + btr_rec_free_updated_extern_fields( + index, rec, block, *offsets, update, true, mtr); + } + + ulint n_ext = index->is_primary() ? 
dtuple_get_n_ext(new_entry) : 0; + + if (page_zip_rec_needs_ext( + rec_get_converted_size(index, new_entry, n_ext), + page_is_comp(block->frame), + dict_index_get_n_fields(index), + block->zip_size()) + || (UNIV_UNLIKELY(update->is_alter_metadata()) + && !dfield_is_ext(dtuple_get_nth_field( + new_entry, + index->first_user_field())))) { + big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext); + if (UNIV_UNLIKELY(big_rec_vec == NULL)) { + + /* We cannot goto return_after_reservations, + because we may need to update the + IBUF_BITMAP_FREE bits, which was suppressed by + BTR_KEEP_IBUF_BITMAP. */ +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip + || page_zip_validate(page_zip, block->frame, + index)); +#endif /* UNIV_ZIP_DEBUG */ + index->table->space->release_free_extents(n_reserved); + err = DB_TOO_BIG_RECORD; + goto err_exit; + } + + ut_ad(page_is_leaf(block->frame)); + ut_ad(dict_index_is_clust(index)); + ut_ad(flags & BTR_KEEP_POS_FLAG); + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets, + update, cmpl_info, + thr, mtr, &roll_ptr); + if (err != DB_SUCCESS) { + goto err_exit; + } + + if (optim_err == DB_OVERFLOW) { + + /* First reserve enough free space for the file segments + of the index tree, so that the update will not fail because + of lack of space */ + + uint32_t n_extents = uint32_t(cursor->tree_height / 16 + 3); + + if (!fsp_reserve_free_extents( + &n_reserved, index->table->space, n_extents, + flags & BTR_NO_UNDO_LOG_FLAG + ? FSP_CLEANING : FSP_NORMAL, + mtr)) { + err = DB_OUT_OF_FILE_SPACE; + goto err_exit; + } + } + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + btr_cur_write_sys(new_entry, index, trx_id, roll_ptr); + } + + const ulint max_ins_size = page_zip + ? 0 : page_get_max_insert_size_after_reorganize(block->frame, + 1); + + if (UNIV_UNLIKELY(is_metadata)) { + ut_ad(new_entry->is_metadata()); + ut_ad(index->is_instant()); + /* This can be innobase_add_instant_try() performing a + subsequent instant ALTER TABLE, or its rollback by + row_undo_mod_clust_low(). */ + ut_ad(flags & BTR_NO_LOCKING_FLAG); + } else { + btr_search_update_hash_on_delete(cursor); + + /* Store state of explicit locks on rec on the page + infimum record, before deleting rec. The page infimum + acts as a dummy carrier of the locks, taking care also + of lock releases, before we can move the locks back on + the actual record. There is a special case: if we are + inserting on the root page and the insert causes a + call of btr_root_raise_and_insert. Therefore we cannot + in the lock system delete the lock structs set on the + root page even if the root page carries just node + pointers. */ + if (!dict_table_is_locking_disabled(index->table)) { + lock_rec_store_on_page_infimum(block, rec); + } + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + page_cursor = btr_cur_get_page_cur(cursor); + + page_cur_delete_rec(page_cursor, index, *offsets, mtr); + + page_cur_move_to_prev(page_cursor); + + rec = btr_cur_insert_if_possible(cursor, new_entry, + offsets, offsets_heap, n_ext, mtr); + + if (rec) { + page_cursor->rec = rec; + + if (UNIV_UNLIKELY(is_metadata)) { + /* We must empty the PAGE_FREE list, because if this + was a rollback, the shortened metadata record + would have too many fields, and we would be unable to + know the size of the freed record. 
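+			(btr_page_reorganize() rebuilds the page from its
+			remaining records and leaves the PAGE_FREE list
+			empty.)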
*/ + btr_page_reorganize(page_cursor, index, mtr); + rec = page_cursor->rec; + rec_offs_make_valid(rec, index, true, *offsets); + if (page_cursor->block->page.id().page_no() + == index->page) { + btr_set_instant(page_cursor->block, *index, + mtr); + } + } else if (!dict_table_is_locking_disabled(index->table)) { + lock_rec_restore_from_page_infimum( + btr_cur_get_block(cursor), rec, block); + } + + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets)) + || rec_is_alter_metadata(rec, *index)) { + /* The new inserted record owns its possible externally + stored fields */ + btr_cur_unmark_extern_fields(btr_cur_get_block(cursor), + rec, index, *offsets, mtr); + } else { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(row_get_rec_trx_id(rec, index, *offsets)); + } + + bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG); + ut_ad(!adjust || page_is_leaf(block->frame)); + + if (btr_cur_compress_if_useful(cursor, adjust, mtr)) { + if (adjust) { + rec_offs_make_valid(page_cursor->rec, index, + true, *offsets); + } + } else if (!dict_index_is_clust(index) + && page_is_leaf(block->frame)) { + /* Update the free bits in the insert buffer. + This is the same block which was skipped by + BTR_KEEP_IBUF_BITMAP. */ + if (page_zip) { + ut_ad(!index->table->is_temporary()); + ibuf_update_free_bits_zip(block, mtr); + } else if (!index->table->is_temporary()) { + ibuf_update_free_bits_low(block, max_ins_size, + mtr); + } + } + + if (!srv_read_only_mode + && !big_rec_vec + && page_is_leaf(block->frame) + && !dict_index_is_online_ddl(index)) { + + mtr_memo_release(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK); + + /* NOTE: We cannot release root block latch here, because it + has segment header and already modified in most of cases.*/ + } + + err = DB_SUCCESS; + goto return_after_reservations; + } else { + /* If the page is compressed and it initially + compresses very well, and there is a subsequent insert + of a badly-compressing record, it is possible for + btr_cur_optimistic_update() to return DB_UNDERFLOW and + btr_cur_insert_if_possible() to return FALSE. */ + ut_a(page_zip || optim_err != DB_UNDERFLOW); + + /* Out of space: reset the free bits. + This is the same block which was skipped by + BTR_KEEP_IBUF_BITMAP. */ + if (!dict_index_is_clust(index) + && !index->table->is_temporary() + && page_is_leaf(block->frame)) { + ibuf_reset_free_bits(block); + } + } + + if (big_rec_vec != NULL) { + ut_ad(page_is_leaf(block->frame)); + ut_ad(dict_index_is_clust(index)); + ut_ad(flags & BTR_KEEP_POS_FLAG); + + /* btr_page_split_and_insert() in + btr_cur_pessimistic_insert() invokes + mtr_memo_release(mtr, index->lock, MTR_MEMO_SX_LOCK). + We must keep the index->lock when we created a + big_rec, so that row_upd_clust_rec() can store the + big_rec in the same mini-transaction. */ + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + mtr_sx_lock_index(index, mtr); + } + + /* Was the record to be updated positioned as the first user + record on its page? */ + was_first = page_cur_is_before_first(page_cursor); + + /* Lock checks and undo logging were already performed by + btr_cur_upd_lock_and_undo(). We do not try + btr_cur_optimistic_insert() because + btr_cur_insert_if_possible() already failed above. 
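+	BTR_NO_UNDO_LOG_FLAG, BTR_NO_LOCKING_FLAG and BTR_KEEP_SYS_FLAG are
+	passed below because undo logging, locking and the system columns
+	DB_TRX_ID,DB_ROLL_PTR were already taken care of above.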
*/ + + err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG, + cursor, offsets, offsets_heap, + new_entry, &rec, + &dummy_big_rec, n_ext, NULL, mtr); + ut_a(rec); + ut_a(err == DB_SUCCESS); + ut_a(dummy_big_rec == NULL); + ut_ad(rec_offs_validate(rec, cursor->index, *offsets)); + page_cursor->rec = rec; + + /* Multiple transactions cannot simultaneously operate on the + same temp-table in parallel. + max_trx_id is ignored for temp tables because it not required + for MVCC. */ + if (dict_index_is_sec_or_ibuf(index) + && !index->table->is_temporary()) { + /* Update PAGE_MAX_TRX_ID in the index page header. + It was not updated by btr_cur_pessimistic_insert() + because of BTR_NO_LOCKING_FLAG. */ + page_update_max_trx_id(btr_cur_get_block(cursor), + btr_cur_get_page_zip(cursor), + trx_id, mtr); + } + + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) { + /* The new inserted record owns its possible externally + stored fields */ +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, block->frame, + index)); +#endif /* UNIV_ZIP_DEBUG */ + btr_cur_unmark_extern_fields(btr_cur_get_block(cursor), rec, + index, *offsets, mtr); + } else { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(row_get_rec_trx_id(rec, index, *offsets)); + } + + if (UNIV_UNLIKELY(is_metadata)) { + /* We must empty the PAGE_FREE list, because if this + was a rollback, the shortened metadata record + would have too many fields, and we would be unable to + know the size of the freed record. */ + btr_page_reorganize(page_cursor, index, mtr); + rec = page_cursor->rec; + } else if (!dict_table_is_locking_disabled(index->table)) { + lock_rec_restore_from_page_infimum( + btr_cur_get_block(cursor), rec, block); + } + + /* If necessary, restore also the correct lock state for a new, + preceding supremum record created in a page split. While the old + record was nonexistent, the supremum might have inherited its locks + from a wrong record. */ + + if (!was_first && !dict_table_is_locking_disabled(index->table)) { + btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor), + rec, mtr); + } + +return_after_reservations: +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(btr_cur_get_page_zip(cursor), + btr_cur_get_page(cursor), index)); +#endif /* UNIV_ZIP_DEBUG */ + + index->table->space->release_free_extents(n_reserved); + *big_rec = big_rec_vec; + return(err); +} + +/*==================== B-TREE DELETE MARK AND UNMARK ===============*/ + +/** Modify the delete-mark flag of a record. +@tparam flag the value of the delete-mark flag +@param[in,out] block buffer block +@param[in,out] rec record on a physical index page +@param[in,out] mtr mini-transaction */ +template<bool flag> +void btr_rec_set_deleted(buf_block_t *block, rec_t *rec, mtr_t *mtr) +{ + if (page_rec_is_comp(rec)) + { + byte *b= &rec[-REC_NEW_INFO_BITS]; + const byte v= flag + ? (*b | REC_INFO_DELETED_FLAG) + : (*b & byte(~REC_INFO_DELETED_FLAG)); + if (*b == v); + else if (UNIV_LIKELY_NULL(block->page.zip.data)) + { + *b= v; + page_zip_rec_set_deleted(block, rec, flag, mtr); + } + else + mtr->write<1>(*block, b, v); + } + else + { + ut_ad(!block->page.zip.data); + byte *b= &rec[-REC_OLD_INFO_BITS]; + const byte v = flag + ? 
(*b | REC_INFO_DELETED_FLAG) + : (*b & byte(~REC_INFO_DELETED_FLAG)); + mtr->write<1,mtr_t::MAYBE_NOP>(*block, b, v); + } +} + +template void btr_rec_set_deleted<false>(buf_block_t *, rec_t *, mtr_t *); +template void btr_rec_set_deleted<true>(buf_block_t *, rec_t *, mtr_t *); + +/***********************************************************//** +Marks a clustered index record deleted. Writes an undo log record to +undo log on this delete marking. Writes in the trx id field the id +of the deleting transaction, and in the roll ptr field pointer to the +undo log record created. +@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ +dberr_t +btr_cur_del_mark_set_clust_rec( +/*===========================*/ + buf_block_t* block, /*!< in/out: buffer block of the record */ + rec_t* rec, /*!< in/out: record */ + dict_index_t* index, /*!< in: clustered index of the record */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec) */ + que_thr_t* thr, /*!< in: query thread */ + const dtuple_t* entry, /*!< in: dtuple for the deleting record, also + contains the virtual cols if there are any */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + roll_ptr_t roll_ptr; + dberr_t err; + trx_t* trx; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + ut_ad(buf_block_get_frame(block) == page_align(rec)); + ut_ad(page_rec_is_leaf(rec)); + ut_ad(mtr->is_named_space(index->table->space)); + + if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { + /* We may already have delete-marked this record + when executing an ON DELETE CASCADE operation. */ + ut_ad(row_get_rec_trx_id(rec, index, offsets) + == thr_get_trx(thr)->id); + return(DB_SUCCESS); + } + + err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block, + rec, index, offsets, thr); + + if (err != DB_SUCCESS) { + + return(err); + } + + err = trx_undo_report_row_operation(thr, index, + entry, NULL, 0, rec, offsets, + &roll_ptr); + if (err != DB_SUCCESS) { + + return(err); + } + + /* The search latch is not needed here, because + the adaptive hash index does not depend on the delete-mark + and the delete-mark is being updated in place. */ + + btr_rec_set_deleted<true>(block, rec, mtr); + + trx = thr_get_trx(thr); + + DBUG_LOG("ib_cur", + "delete-mark clust " << index->table->name + << " (" << index->id << ") by " + << ib::hex(trx_get_id_for_print(trx)) << ": " + << rec_printer(rec, offsets).str()); + + if (dict_index_is_online_ddl(index)) { + row_log_table_delete(rec, index, offsets, NULL); + } + + btr_cur_upd_rec_sys(block, rec, index, offsets, trx, roll_ptr, mtr); + return(err); +} + +/*==================== B-TREE RECORD REMOVE =========================*/ + +/*************************************************************//** +Tries to compress a page of the tree if it seems useful. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. To avoid +deadlocks, mtr must also own x-latches to brothers of page, if those +brothers exist. NOTE: it is assumed that the caller has reserved enough +free extents so that the compression will always succeed if done! 
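To make the delete-mark convention of btr_rec_set_deleted() above concrete: the flag is a single bit in the info-bits byte stored just before the record origin, and the write goes through the mini-transaction (or the page_zip path) so it is logged. A minimal standalone sketch of just the bit manipulation; the flag value and the one-byte "record" are illustrative stand-ins, not the real REC_NEW_INFO_BITS/REC_OLD_INFO_BITS layout.

#include <cassert>
#include <cstdint>

/* Stand-in for REC_INFO_DELETED_FLAG; the real constant lives in rem0rec.h. */
static constexpr uint8_t INFO_DELETED_FLAG = 0x20;

/* One-byte model of the info bits stored just before the record origin. */
struct rec_model { uint8_t info_bits; };

template<bool flag>
void set_deleted(rec_model& rec)
{
  /* Same select-then-write shape as btr_rec_set_deleted() above; the real
  code routes the write through mtr->write<1>() or page_zip_rec_set_deleted()
  so that the change is logged. */
  rec.info_bits = flag ? uint8_t(rec.info_bits | INFO_DELETED_FLAG)
                       : uint8_t(rec.info_bits & uint8_t(~INFO_DELETED_FLAG));
}

int main()
{
  rec_model r{0};
  set_deleted<true>(r);
  assert(r.info_bits & INFO_DELETED_FLAG);    /* delete-marked */
  set_deleted<false>(r);
  assert(!(r.info_bits & INFO_DELETED_FLAG)); /* unmarked again */
}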
+@return TRUE if compression occurred */ +ibool +btr_cur_compress_if_useful( +/*=======================*/ + btr_cur_t* cursor, /*!< in/out: cursor on the page to compress; + cursor does not stay valid if !adjust and + compression occurs */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(mtr->memo_contains_flagged(&cursor->index->lock, + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + if (cursor->index->is_spatial()) { + const trx_t* trx = cursor->rtr_info->thr + ? thr_get_trx(cursor->rtr_info->thr) + : NULL; + const buf_block_t* block = btr_cur_get_block(cursor); + + /* Check whether page lock prevents the compression */ + if (!lock_test_prdt_page_lock(trx, block->page.id())) { + return(false); + } + } + + return(btr_cur_compress_recommendation(cursor, mtr) + && btr_compress(cursor, adjust, mtr)); +} + +/*******************************************************//** +Removes the record on which the tree cursor is positioned on a leaf page. +It is assumed that the mtr has an x-latch on the page where the cursor is +positioned, but no latch on the whole tree. +@return TRUE if success, i.e., the page did not become too empty */ +ibool +btr_cur_optimistic_delete_func( +/*===========================*/ + btr_cur_t* cursor, /*!< in: cursor on leaf page, on the record to + delete; cursor stays valid: if deletion + succeeds, on function exit it points to the + successor of the deleted record */ +#ifdef UNIV_DEBUG + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ +#endif /* UNIV_DEBUG */ + mtr_t* mtr) /*!< in: mtr; if this function returns + TRUE on a leaf page of a secondary + index, the mtr must be committed + before latching any further pages */ +{ + buf_block_t* block; + rec_t* rec; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(flags == 0 || flags == BTR_CREATE_FLAG); + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr->is_named_space(cursor->index->table->space)); + ut_ad(!cursor->index->is_dummy); + + /* This is intended only for leaf page deletions */ + + block = btr_cur_get_block(cursor); + + ut_ad(block->page.id().space() == cursor->index->table->space->id); + ut_ad(page_is_leaf(buf_block_get_frame(block))); + ut_ad(!dict_index_is_online_ddl(cursor->index) + || dict_index_is_clust(cursor->index) + || (flags & BTR_CREATE_FLAG)); + + rec = btr_cur_get_rec(cursor); + + offsets = rec_get_offsets(rec, cursor->index, offsets, + cursor->index->n_core_fields, + ULINT_UNDEFINED, &heap); + + const ibool no_compress_needed = !rec_offs_any_extern(offsets) + && btr_cur_can_delete_without_compress( + cursor, rec_offs_size(offsets), mtr); + + if (!no_compress_needed) { + /* prefetch siblings of the leaf for the pessimistic + operation. */ + btr_cur_prefetch_siblings(block, cursor->index); + goto func_exit; + } + + if (UNIV_UNLIKELY(block->page.id().page_no() == cursor->index->page + && page_get_n_recs(block->frame) == 1 + + (cursor->index->is_instant() + && !rec_is_metadata(rec, *cursor->index)) + && !cursor->index->must_avoid_clear_instant_add())) { + /* The whole index (and table) becomes logically empty. + Empty the whole page. That is, if we are deleting the + only user record, also delete the metadata record + if one exists for instant ADD COLUMN (not generic ALTER TABLE). 
+ If we are deleting the metadata record and the + table becomes empty, clean up the whole page. */ + dict_index_t* index = cursor->index; + const rec_t* first_rec = page_rec_get_next_const( + page_get_infimum_rec(block->frame)); + ut_ad(!index->is_instant() + || rec_is_metadata(first_rec, *index)); + const bool is_metadata = rec_is_metadata(rec, *index); + /* We can remove the metadata when rolling back an + instant ALTER TABLE operation, or when deleting the + last user record on the page such that only metadata for + instant ADD COLUMN (not generic ALTER TABLE) remains. */ + const bool empty_table = is_metadata + || !index->is_instant() + || (first_rec != rec + && rec_is_add_metadata(first_rec, *index)); + if (UNIV_LIKELY(empty_table)) { + if (UNIV_LIKELY(!is_metadata)) { + lock_update_delete(block, rec); + } + btr_page_empty(block, buf_block_get_page_zip(block), + index, 0, mtr); + if (index->is_instant()) { + /* MDEV-17383: free metadata BLOBs! */ + index->clear_instant_alter(); + } + page_cur_set_after_last(block, + btr_cur_get_page_cur(cursor)); + goto func_exit; + } + } + + { + page_t* page = buf_block_get_frame(block); + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + + if (UNIV_UNLIKELY(rec_get_info_bits(rec, page_rec_is_comp(rec)) + & REC_INFO_MIN_REC_FLAG)) { + /* This should be rolling back instant ADD COLUMN. + If this is a recovered transaction, then + index->is_instant() will hold until the + insert into SYS_COLUMNS is rolled back. */ + ut_ad(cursor->index->table->supports_instant()); + ut_ad(cursor->index->is_primary()); + ut_ad(!page_zip); + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + cursor->index, offsets, mtr); + /* We must empty the PAGE_FREE list, because + after rollback, this deleted metadata record + would have too many fields, and we would be + unable to know the size of the freed record. */ + btr_page_reorganize(btr_cur_get_page_cur(cursor), + cursor->index, mtr); + goto func_exit; + } else { + lock_update_delete(block, rec); + + btr_search_update_hash_on_delete(cursor); + } + + if (page_zip) { +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, cursor->index)); +#endif /* UNIV_ZIP_DEBUG */ + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + cursor->index, offsets, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, cursor->index)); +#endif /* UNIV_ZIP_DEBUG */ + + /* On compressed pages, the IBUF_BITMAP_FREE + space is not affected by deleting (purging) + records, because it is defined as the minimum + of space available *without* reorganize, and + space available in the modification log. */ + } else { + const ulint max_ins + = page_get_max_insert_size_after_reorganize( + page, 1); + + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + cursor->index, offsets, mtr); + + /* The change buffer does not handle inserts + into non-leaf pages, into clustered indexes, + or into the change buffer. */ + if (!dict_index_is_clust(cursor->index) + && !cursor->index->table->is_temporary() + && !dict_index_is_ibuf(cursor->index)) { + ibuf_update_free_bits_low(block, max_ins, mtr); + } + } + } + +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(no_compress_needed); +} + +/*************************************************************//** +Removes the record on which the tree cursor is positioned. Tries +to compress the page if its fillfactor drops below a threshold +or if it is the only page on the level. It is assumed that mtr holds +an x-latch on the tree and on the cursor page. 
To avoid deadlocks, +mtr must also own x-latches to brothers of page, if those brothers +exist. +@return TRUE if compression occurred and FALSE if not or something +wrong. */ +ibool +btr_cur_pessimistic_delete( +/*=======================*/ + dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE; + the latter may occur because we may have + to update node pointers on upper levels, + and in the case of variable length keys + these may actually grow in size */ + ibool has_reserved_extents, /*!< in: TRUE if the + caller has already reserved enough free + extents so that he knows that the operation + will succeed */ + btr_cur_t* cursor, /*!< in: cursor on the record to delete; + if compression does not occur, the cursor + stays valid: it points to successor of + deleted record on function exit */ + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + dict_index_t* index; + rec_t* rec; + uint32_t n_reserved = 0; + bool success; + ibool ret = FALSE; + mem_heap_t* heap; + rec_offs* offsets; +#ifdef UNIV_DEBUG + bool parent_latched = false; +#endif /* UNIV_DEBUG */ + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + index = btr_cur_get_index(cursor); + + ut_ad(flags == 0 || flags == BTR_CREATE_FLAG); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr->is_named_space(index->table->space)); + ut_ad(!index->is_dummy); + ut_ad(block->page.id().space() == index->table->space->id); + + if (!has_reserved_extents) { + /* First reserve enough free space for the file segments + of the index tree, so that the node pointer updates will + not fail because of lack of space */ + + uint32_t n_extents = uint32_t(cursor->tree_height / 32 + 1); + + success = fsp_reserve_free_extents(&n_reserved, + index->table->space, + n_extents, + FSP_CLEANING, mtr); + if (!success) { + *err = DB_OUT_OF_FILE_SPACE; + + return(FALSE); + } + } + + heap = mem_heap_create(1024); + rec = btr_cur_get_rec(cursor); + page_zip = buf_block_get_page_zip(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + offsets = rec_get_offsets(rec, index, NULL, page_is_leaf(page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + + if (rec_offs_any_extern(offsets)) { + btr_rec_free_externally_stored_fields(index, + rec, offsets, block, + rollback, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + } + + rec_t* next_rec = NULL; + bool min_mark_next_rec = false; + + if (page_is_leaf(page)) { + const bool is_metadata = rec_is_metadata( + rec, page_rec_is_comp(rec)); + if (UNIV_UNLIKELY(is_metadata)) { + /* This should be rolling back instant ALTER TABLE. + If this is a recovered transaction, then + index->is_instant() will hold until the + insert into SYS_COLUMNS is rolled back. 
*/ + ut_ad(rollback); + ut_ad(index->table->supports_instant()); + ut_ad(index->is_primary()); + } else if (flags == 0) { + lock_update_delete(block, rec); + } + + if (block->page.id().page_no() != index->page) { + if (page_get_n_recs(page) < 2) { + goto discard_page; + } + } else if (page_get_n_recs(page) == 1 + + (index->is_instant() && !is_metadata) + && !index->must_avoid_clear_instant_add()) { + /* The whole index (and table) becomes logically empty. + Empty the whole page. That is, if we are deleting the + only user record, also delete the metadata record + if one exists for instant ADD COLUMN + (not generic ALTER TABLE). + If we are deleting the metadata record + (in the rollback of instant ALTER TABLE) and the + table becomes empty, clean up the whole page. */ + + const rec_t* first_rec = page_rec_get_next_const( + page_get_infimum_rec(page)); + ut_ad(!index->is_instant() + || rec_is_metadata(first_rec, *index)); + if (is_metadata || !index->is_instant() + || (first_rec != rec + && rec_is_add_metadata(first_rec, *index))) { + btr_page_empty(block, page_zip, index, 0, mtr); + if (index->is_instant()) { + /* MDEV-17383: free metadata BLOBs! */ + index->clear_instant_alter(); + } + page_cur_set_after_last( + block, + btr_cur_get_page_cur(cursor)); + ret = TRUE; + goto return_after_reservations; + } + } + + if (UNIV_LIKELY(!is_metadata)) { + btr_search_update_hash_on_delete(cursor); + } else { + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + index, offsets, mtr); + /* We must empty the PAGE_FREE list, because + after rollback, this deleted metadata record + would carry too many fields, and we would be + unable to know the size of the freed record. */ + btr_page_reorganize(btr_cur_get_page_cur(cursor), + index, mtr); + ut_ad(!ret); + goto return_after_reservations; + } + } else if (UNIV_UNLIKELY(page_rec_is_first(rec, page))) { + if (page_rec_is_last(rec, page)) { +discard_page: + ut_ad(page_get_n_recs(page) == 1); + /* If there is only one record, drop + the whole page. */ + + btr_discard_page(cursor, mtr); + + ret = TRUE; + goto return_after_reservations; + } + + next_rec = page_rec_get_next(rec); + + if (!page_has_prev(page)) { + /* If we delete the leftmost node pointer on a + non-leaf level, we must mark the new leftmost node + pointer as the predefined minimum record */ + + min_mark_next_rec = true; + } else if (index->is_spatial()) { + /* For rtree, if delete the leftmost node pointer, + we need to update parent page. 
*/ + rtr_mbr_t father_mbr; + rec_t* father_rec; + btr_cur_t father_cursor; + rec_offs* offsets; + bool upd_ret; + ulint len; + + rtr_page_get_father_block(NULL, heap, index, + block, mtr, NULL, + &father_cursor); + offsets = rec_get_offsets( + btr_cur_get_rec(&father_cursor), index, NULL, + 0, ULINT_UNDEFINED, &heap); + + father_rec = btr_cur_get_rec(&father_cursor); + rtr_read_mbr(rec_get_nth_field( + father_rec, offsets, 0, &len), &father_mbr); + + upd_ret = rtr_update_mbr_field(&father_cursor, offsets, + NULL, page, &father_mbr, + next_rec, mtr); + + if (!upd_ret) { + *err = DB_ERROR; + + mem_heap_free(heap); + return(FALSE); + } + + ut_d(parent_latched = true); + } else { + /* Otherwise, if we delete the leftmost node pointer + on a page, we have to change the parent node pointer + so that it is equal to the new leftmost node pointer + on the page */ + btr_cur_t cursor; + btr_page_get_father(index, block, mtr, &cursor); + btr_cur_node_ptr_delete(&cursor, mtr); + const ulint level = btr_page_get_level(page); + // FIXME: reuse the node_ptr from above + dtuple_t* node_ptr = dict_index_build_node_ptr( + index, next_rec, block->page.id().page_no(), + heap, level); + + btr_insert_on_non_leaf_level( + flags, index, level + 1, node_ptr, mtr); + + ut_d(parent_latched = true); + } + } + + /* SPATIAL INDEX never use SX locks; we can allow page merges + while holding X lock on the spatial index tree. + Do not allow merges of non-leaf B-tree pages unless it is + safe to do so. */ + { + const bool allow_merge = page_is_leaf(page) + || dict_index_is_spatial(index) + || btr_cur_will_modify_tree( + index, page, BTR_INTENTION_DELETE, rec, + btr_node_ptr_max_size(index), + block->zip_size(), mtr); + page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, + offsets, mtr); + + if (min_mark_next_rec) { + btr_set_min_rec_mark(next_rec, *block, mtr); + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + ut_ad(!parent_latched + || btr_check_node_ptr(index, block, mtr)); + + if (!ret && btr_cur_compress_recommendation(cursor, mtr)) { + if (UNIV_LIKELY(allow_merge)) { + ret = btr_cur_compress_if_useful( + cursor, FALSE, mtr); + } else { + ib::warn() << "Not merging page " + << block->page.id() + << " in index " << index->name + << " of " << index->table->name; + ut_ad("MDEV-14637" == 0); + } + } + } + +return_after_reservations: + *err = DB_SUCCESS; + + mem_heap_free(heap); + + if (!srv_read_only_mode + && page_is_leaf(page) + && !dict_index_is_online_ddl(index)) { + + mtr_memo_release(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK); + + /* NOTE: We cannot release root block latch here, because it + has segment header and already modified in most of cases.*/ + } + + index->table->space->release_free_extents(n_reserved); + return(ret); +} + +/** Delete the node pointer in a parent page. 
+@param[in,out] parent cursor pointing to parent record +@param[in,out] mtr mini-transaction */ +void btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr) +{ + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(parent), + MTR_MEMO_PAGE_X_FIX)); + dberr_t err; + ibool compressed = btr_cur_pessimistic_delete(&err, TRUE, parent, + BTR_CREATE_FLAG, false, + mtr); + ut_a(err == DB_SUCCESS); + if (!compressed) { + btr_cur_compress_if_useful(parent, FALSE, mtr); + } +} + +/*******************************************************************//** +Adds path information to the cursor for the current page, for which +the binary search has been performed. */ +static +void +btr_cur_add_path_info( +/*==================*/ + btr_cur_t* cursor, /*!< in: cursor positioned on a page */ + ulint height, /*!< in: height of the page in tree; + 0 means leaf node */ + ulint root_height) /*!< in: root node height in tree */ +{ + btr_path_t* slot; + + ut_a(cursor->path_arr); + + if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) { + /* Do nothing; return empty path */ + + slot = cursor->path_arr; + slot->nth_rec = ULINT_UNDEFINED; + + return; + } + + if (height == 0) { + /* Mark end of slots for path */ + slot = cursor->path_arr + root_height + 1; + slot->nth_rec = ULINT_UNDEFINED; + } + + slot = cursor->path_arr + (root_height - height); + + const buf_block_t* block = btr_cur_get_block(cursor); + + slot->nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor)); + slot->n_recs = page_get_n_recs(block->frame); + slot->page_no = block->page.id().page_no(); + slot->page_level = btr_page_get_level(block->frame); +} + +/*******************************************************************//** +Estimate the number of rows between slot1 and slot2 for any level on a +B-tree. This function starts from slot1->page and reads a few pages to +the right, counting their records. If we reach slot2->page quickly then +we know exactly how many records there are between slot1 and slot2 and +we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly +then we calculate the average number of records in the pages scanned +so far and assume that all pages that we did not scan up to slot2->page +contain the same number of records, then we multiply that average to +the number of pages between slot1->page and slot2->page (which is +n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE. +@return number of rows, not including the borders (exact or estimated) */ +static +ha_rows +btr_estimate_n_rows_in_range_on_level( +/*==================================*/ + dict_index_t* index, /*!< in: index */ + btr_path_t* slot1, /*!< in: left border */ + btr_path_t* slot2, /*!< in: right border */ + ha_rows n_rows_on_prev_level, /*!< in: number of rows + on the previous level for the + same descend paths; used to + determine the number of pages + on this level */ + bool* is_n_rows_exact) /*!< out: TRUE if the returned + value is exact i.e. not an + estimation */ +{ + ha_rows n_rows = 0; + uint n_pages_read = 0; + ulint level; + + /* Assume by default that we will scan all pages between + slot1->page_no and slot2->page_no. */ + *is_n_rows_exact = true; + + /* Add records from slot1->page_no which are to the right of + the record which serves as a left border of the range, if any + (we don't include the record itself in this count). 
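For orientation, the path slots filled in by btr_cur_add_path_info() can be pictured with a simplified stand-in struct; the numbers below mirror the two-level example quoted in the comments of the range-estimation code further down and are purely illustrative.

#include <cstdio>

/* Simplified stand-in for btr_path_t; field meanings follow the code above. */
struct path_slot {
  unsigned nth_rec;    /* position of the chosen record on the page */
  unsigned n_recs;     /* user records on the page */
  unsigned page_no;    /* page number */
  unsigned page_level; /* 0 = leaf */
};

int main()
{
  /* Hypothetical dive through a two-level tree: slot 0 is the root
  (level 1), slot 1 the leaf (level 0). nth_rec > n_recs on the leaf
  means the cursor ended up past the last user record. */
  const path_slot path[] = {
    {58, 58, 3,   1},
    {56, 55, 119, 0},
  };
  for (const path_slot& s : path)
    std::printf("level %u: rec %u of %u on page %u\n",
                s.page_level, s.nth_rec, s.n_recs, s.page_no);
}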
*/ + if (slot1->nth_rec <= slot1->n_recs) { + n_rows += slot1->n_recs - slot1->nth_rec; + } + + /* Add records from slot2->page_no which are to the left of + the record which servers as a right border of the range, if any + (we don't include the record itself in this count). */ + if (slot2->nth_rec > 1) { + n_rows += slot2->nth_rec - 1; + } + + /* Count the records in the pages between slot1->page_no and + slot2->page_no (non inclusive), if any. */ + + /* Do not read more than this number of pages in order not to hurt + performance with this code which is just an estimation. If we read + this many pages before reaching slot2->page_no then we estimate the + average from the pages scanned so far. */ +# define N_PAGES_READ_LIMIT 10 + + const fil_space_t* space = index->table->space; + page_id_t page_id(space->id, slot1->page_no); + const ulint zip_size = space->zip_size(); + + level = slot1->page_level; + + do { + mtr_t mtr; + page_t* page; + buf_block_t* block; + dberr_t err=DB_SUCCESS; + + mtr_start(&mtr); + + /* Fetch the page. Because we are not holding the + index->lock, the tree may have changed and we may be + attempting to read a page that is no longer part of + the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to + silence a debug assertion about this. */ + block = buf_page_get_gen(page_id, zip_size, RW_S_LATCH, + NULL, BUF_GET_POSSIBLY_FREED, + __FILE__, __LINE__, &mtr, &err); + + ut_ad((block != NULL) == (err == DB_SUCCESS)); + + if (!block) { + if (err == DB_DECRYPTION_FAILED) { + ib_push_warning((void *)NULL, + DB_DECRYPTION_FAILED, + "Table %s is encrypted but encryption service or" + " used key_id is not available. " + " Can't continue reading table.", + index->table->name.m_name); + index->table->file_unreadable = true; + } + + mtr_commit(&mtr); + goto inexact; + } + + page = buf_block_get_frame(block); + + /* It is possible that the tree has been reorganized in the + meantime and this is a different page. If this happens the + calculated estimate will be bogus, which is not fatal as + this is only an estimate. We are sure that a page with + page_no exists because InnoDB never frees pages, only + reuses them. */ + if (!fil_page_index_page_check(page) + || btr_page_get_index_id(page) != index->id + || btr_page_get_level(page) != level) { + + /* The page got reused for something else */ + mtr_commit(&mtr); + goto inexact; + } + + /* It is possible but highly unlikely that the page was + originally written by an old version of InnoDB that did + not initialize FIL_PAGE_TYPE on other than B-tree pages. + For example, this could be an almost-empty BLOB page + that happens to contain the magic values in the fields + that we checked above. */ + + n_pages_read++; + + if (page_id.page_no() != slot1->page_no) { + /* Do not count the records on slot1->page_no, + we already counted them before this loop. 
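When the page scan implemented here gives up before reaching slot2->page_no, the estimate is extrapolated from the average number of records per page seen so far, multiplied by the expected number of pages on the level (n_rows_on_prev_level). A self-contained sketch of that arithmetic, with made-up numbers:

#include <cstdint>
#include <cstdio>

/* Extrapolation used on the "inexact" path of
btr_estimate_n_rows_in_range_on_level(): expected pages on the level
times the average records per page seen so far. */
static uint64_t extrapolate_rows(uint64_t n_rows_counted,
                                 unsigned n_pages_read,
                                 uint64_t n_pages_on_level)
{
  if (n_pages_read == 0)
    return 10;  /* the function's own fallback when nothing was scanned */
  return n_pages_on_level * n_rows_counted / n_pages_read;
}

int main()
{
  /* Hypothetical: 10 pages scanned, 950 records counted, and the previous
  level suggests roughly 200 pages on this level: estimate 19000 rows. */
  std::printf("%llu\n",
              (unsigned long long) extrapolate_rows(950, 10, 200));
}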
*/ + n_rows += page_get_n_recs(page); + } + + page_id.set_page_no(btr_page_get_next(page)); + + mtr_commit(&mtr); + + if (n_pages_read == N_PAGES_READ_LIMIT + || page_id.page_no() == FIL_NULL) { + /* Either we read too many pages or + we reached the end of the level without passing + through slot2->page_no, the tree must have changed + in the meantime */ + goto inexact; + } + + } while (page_id.page_no() != slot2->page_no); + + return(n_rows); + +inexact: + + *is_n_rows_exact = false; + + /* We did interrupt before reaching slot2->page */ + + if (n_pages_read > 0) { + /* The number of pages on this level is + n_rows_on_prev_level, multiply it by the + average number of recs per page so far */ + n_rows = n_rows_on_prev_level * n_rows / n_pages_read; + } else { + /* The tree changed before we could even + start with slot1->page_no */ + n_rows = 10; + } + + return(n_rows); +} + +/** If the tree gets changed too much between the two dives for the left +and right boundary then btr_estimate_n_rows_in_range_low() will retry +that many times before giving up and returning the value stored in +rows_in_range_arbitrary_ret_val. */ +static const unsigned rows_in_range_max_retries = 4; + +/** We pretend that a range has that many records if the tree keeps changing +for rows_in_range_max_retries retries while we try to estimate the records +in a given range. */ +static const ha_rows rows_in_range_arbitrary_ret_val = 10; + +/** Estimates the number of rows in a given index range. +@param[in] index index +@param[in] tuple1 range start +@param[in] tuple2 range end +@param[in] nth_attempt if the tree gets modified too much while +we are trying to analyze it, then we will retry (this function will call +itself, incrementing this parameter) +@return estimated number of rows; if after rows_in_range_max_retries +retries the tree keeps changing, then we will just return +rows_in_range_arbitrary_ret_val as a result (if +nth_attempt >= rows_in_range_max_retries and the tree is modified between +the two dives). */ +static +ha_rows +btr_estimate_n_rows_in_range_low( + dict_index_t* index, + btr_pos_t* tuple1, + btr_pos_t* tuple2, + unsigned nth_attempt) +{ + btr_path_t path1[BTR_PATH_ARRAY_N_SLOTS]; + btr_path_t path2[BTR_PATH_ARRAY_N_SLOTS]; + btr_cur_t cursor; + btr_path_t* slot1; + btr_path_t* slot2; + bool diverged; + bool diverged_lot; + ulint divergence_level; + ha_rows n_rows; + bool is_n_rows_exact; + ulint i; + mtr_t mtr; + ha_rows table_n_rows; + page_cur_mode_t mode2= tuple2->mode; + + table_n_rows = dict_table_get_n_rows(index->table); + + /* Below we dive to the two records specified by tuple1 and tuple2 and + we remember the entire dive paths from the tree root. The place where + the tuple1 path ends on the leaf level we call "left border" of our + interval and the place where the tuple2 path ends on the leaf level - + "right border". We take care to either include or exclude the interval + boundaries depending on whether <, <=, > or >= was specified. For + example if "5 < x AND x <= 10" then we should not include the left + boundary, but should include the right one. */ + + mtr_start(&mtr); + + cursor.path_arr = path1; + + bool should_count_the_left_border; + + if (dtuple_get_n_fields(tuple1->tuple) > 0) { + + btr_cur_search_to_nth_level(index, 0, tuple1->tuple, + tuple1->mode, + BTR_SEARCH_LEAF | BTR_ESTIMATE, + &cursor, 0, + __FILE__, __LINE__, &mtr); + + ut_ad(!page_rec_is_infimum(btr_cur_get_rec(&cursor))); + + /* We should count the border if there are any records to + match the criteria, i.e. 
if the maximum record on the tree is + 5 and x > 3 is specified then the cursor will be positioned at + 5 and we should count the border, but if x > 7 is specified, + then the cursor will be positioned at 'sup' on the rightmost + leaf page in the tree and we should not count the border. */ + should_count_the_left_border + = !page_rec_is_supremum(btr_cur_get_rec(&cursor)); + } else { + dberr_t err = DB_SUCCESS; + + err = btr_cur_open_at_index_side(true, index, + BTR_SEARCH_LEAF | BTR_ESTIMATE, + &cursor, 0, &mtr); + + if (err != DB_SUCCESS) { + ib::warn() << " Error code: " << err + << " btr_estimate_n_rows_in_range_low " + << " called from file: " + << __FILE__ << " line: " << __LINE__ + << " table: " << index->table->name + << " index: " << index->name; + } + + ut_ad(page_rec_is_infimum(btr_cur_get_rec(&cursor))); + + /* The range specified is wihout a left border, just + 'x < 123' or 'x <= 123' and btr_cur_open_at_index_side() + positioned the cursor on the infimum record on the leftmost + page, which must not be counted. */ + should_count_the_left_border = false; + } + + tuple1->page_id= cursor.page_cur.block->page.id(); + + mtr_commit(&mtr); + + if (!index->is_readable()) { + return 0; + } + + mtr_start(&mtr); + + cursor.path_arr = path2; + + bool should_count_the_right_border; + + if (dtuple_get_n_fields(tuple2->tuple) > 0) { + + btr_cur_search_to_nth_level(index, 0, tuple2->tuple, + mode2, + BTR_SEARCH_LEAF | BTR_ESTIMATE, + &cursor, 0, + __FILE__, __LINE__, &mtr); + + const rec_t* rec = btr_cur_get_rec(&cursor); + + ut_ad(!(mode2 == PAGE_CUR_L && page_rec_is_supremum(rec))); + + should_count_the_right_border + = (mode2 == PAGE_CUR_LE /* if the range is '<=' */ + /* and the record was found */ + && cursor.low_match >= dtuple_get_n_fields(tuple2->tuple)) + || (mode2 == PAGE_CUR_L /* or if the range is '<' */ + /* and there are any records to match the criteria, + i.e. if the minimum record on the tree is 5 and + x < 7 is specified then the cursor will be + positioned at 5 and we should count the border, but + if x < 2 is specified, then the cursor will be + positioned at 'inf' and we should not count the + border */ + && !page_rec_is_infimum(rec)); + /* Notice that for "WHERE col <= 'foo'" MySQL passes to + ha_innobase::records_in_range(): + min_key=NULL (left-unbounded) which is expected + max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is + unexpected - one would expect + flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In this case the + cursor will be positioned on the first record to the right of + the requested one (can also be positioned on the 'sup') and + we should not count the right border. */ + } else { + dberr_t err = DB_SUCCESS; + + err = btr_cur_open_at_index_side(false, index, + BTR_SEARCH_LEAF | BTR_ESTIMATE, + &cursor, 0, &mtr); + + if (err != DB_SUCCESS) { + ib::warn() << " Error code: " << err + << " btr_estimate_n_rows_in_range_low " + << " called from file: " + << __FILE__ << " line: " << __LINE__ + << " table: " << index->table->name + << " index: " << index->name; + } + + ut_ad(page_rec_is_supremum(btr_cur_get_rec(&cursor))); + + /* The range specified is wihout a right border, just + 'x > 123' or 'x >= 123' and btr_cur_open_at_index_side() + positioned the cursor on the supremum record on the rightmost + page, which must not be counted. 
*/ + should_count_the_right_border = false; + } + + tuple2->page_id= cursor.page_cur.block->page.id(); + + mtr_commit(&mtr); + + /* We have the path information for the range in path1 and path2 */ + + n_rows = 0; + is_n_rows_exact = true; + + /* This becomes true when the two paths do not pass through the + same pages anymore. */ + diverged = false; + + /* This becomes true when the paths are not the same or adjacent + any more. This means that they pass through the same or + neighboring-on-the-same-level pages only. */ + diverged_lot = false; + + /* This is the level where paths diverged a lot. */ + divergence_level = 1000000; + + for (i = 0; ; i++) { + ut_ad(i < BTR_PATH_ARRAY_N_SLOTS); + + slot1 = path1 + i; + slot2 = path2 + i; + + if (slot1->nth_rec == ULINT_UNDEFINED + || slot2->nth_rec == ULINT_UNDEFINED) { + + /* Here none of the borders were counted. For example, + if on the leaf level we descended to: + (inf, a, b, c, d, e, f, sup) + ^ ^ + path1 path2 + then n_rows will be 2 (c and d). */ + + if (is_n_rows_exact) { + /* Only fiddle to adjust this off-by-one + if the number is exact, otherwise we do + much grosser adjustments below. */ + + btr_path_t* last1 = &path1[i - 1]; + btr_path_t* last2 = &path2[i - 1]; + + /* If both paths end up on the same record on + the leaf level. */ + if (last1->page_no == last2->page_no + && last1->nth_rec == last2->nth_rec) { + + /* n_rows can be > 0 here if the paths + were first different and then converged + to the same record on the leaf level. + For example: + SELECT ... LIKE 'wait/synch/rwlock%' + mode1=PAGE_CUR_GE, + tuple1="wait/synch/rwlock" + path1[0]={nth_rec=58, n_recs=58, + page_no=3, page_level=1} + path1[1]={nth_rec=56, n_recs=55, + page_no=119, page_level=0} + + mode2=PAGE_CUR_G + tuple2="wait/synch/rwlock" + path2[0]={nth_rec=57, n_recs=57, + page_no=3, page_level=1} + path2[1]={nth_rec=56, n_recs=55, + page_no=119, page_level=0} */ + + /* If the range is such that we should + count both borders, then avoid + counting that record twice - once as a + left border and once as a right + border. */ + if (should_count_the_left_border + && should_count_the_right_border) { + + n_rows = 1; + } else { + /* Some of the borders should + not be counted, e.g. [3,3). */ + n_rows = 0; + } + } else { + if (should_count_the_left_border) { + n_rows++; + } + + if (should_count_the_right_border) { + n_rows++; + } + } + } + + if (i > divergence_level + 1 && !is_n_rows_exact) { + /* In trees whose height is > 1 our algorithm + tends to underestimate: multiply the estimate + by 2: */ + + n_rows = n_rows * 2; + } + + DBUG_EXECUTE_IF("bug14007649", return(n_rows);); + + /* Do not estimate the number of rows in the range + to over 1 / 2 of the estimated rows in the whole + table */ + + if (n_rows > table_n_rows / 2 && !is_n_rows_exact) { + + n_rows = table_n_rows / 2; + + /* If there are just 0 or 1 rows in the table, + then we estimate all rows are in the range */ + + if (n_rows == 0) { + n_rows = table_n_rows; + } + } + + return(n_rows); + } + + if (!diverged && slot1->nth_rec != slot2->nth_rec) { + + /* If both slots do not point to the same page, + this means that the tree must have changed between + the dive for slot1 and the dive for slot2 at the + beginning of this function. */ + if (slot1->page_no != slot2->page_no + || slot1->page_level != slot2->page_level) { + + /* If the tree keeps changing even after a + few attempts, then just return some arbitrary + number. 
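Whether each range border contributes one row depends on the search mode and on where the dive landed, as the comments above spell out. The sketch below condenses those rules into two helpers; the enum and parameter names are local to the sketch, not the real page_cur_mode_t interface.

#include <cstdio>

/* Simplified stand-ins for the PAGE_CUR_L / PAGE_CUR_LE modes used for the
right end of the range. */
enum class right_mode { less, less_equal };

/* Left border: counted unless the range is left-unbounded or the '>'/'>='
dive landed on the supremum (nothing matches). */
static bool count_left_border(bool left_unbounded, bool landed_on_supremum)
{
  return !left_unbounded && !landed_on_supremum;
}

/* Right border: for '<=' counted only when the key was actually found;
for '<' counted unless the dive landed on the infimum (nothing smaller). */
static bool count_right_border(bool right_unbounded, right_mode m,
                               bool key_found, bool landed_on_infimum)
{
  if (right_unbounded)
    return false;
  return m == right_mode::less_equal ? key_found : !landed_on_infimum;
}

int main()
{
  /* "x > 3" when the largest key is 5: cursor lands on 5, border counted. */
  std::printf("%d\n", count_left_border(false, false));
  /* "x > 7" on the same index: cursor lands on the supremum, not counted. */
  std::printf("%d\n", count_left_border(false, true));
  /* "x <= 10" and 10 exists in the index: right border counted. */
  std::printf("%d\n", count_right_border(false, right_mode::less_equal,
                                         true, false));
}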
*/ + if (nth_attempt >= rows_in_range_max_retries) { + return(rows_in_range_arbitrary_ret_val); + } + + return btr_estimate_n_rows_in_range_low( + index, tuple1, tuple2, + nth_attempt + 1); + } + + diverged = true; + + if (slot1->nth_rec < slot2->nth_rec) { + /* We do not count the borders (nor the left + nor the right one), thus "- 1". */ + n_rows = slot2->nth_rec - slot1->nth_rec - 1; + + if (n_rows > 0) { + /* There is at least one row between + the two borders pointed to by slot1 + and slot2, so on the level below the + slots will point to non-adjacent + pages. */ + diverged_lot = true; + divergence_level = i; + } + } else { + /* It is possible that + slot1->nth_rec >= slot2->nth_rec + if, for example, we have a single page + tree which contains (inf, 5, 6, supr) + and we select where x > 20 and x < 30; + in this case slot1->nth_rec will point + to the supr record and slot2->nth_rec + will point to 6. */ + n_rows = 0; + should_count_the_left_border = false; + should_count_the_right_border = false; + } + + } else if (diverged && !diverged_lot) { + + if (slot1->nth_rec < slot1->n_recs + || slot2->nth_rec > 1) { + + diverged_lot = true; + divergence_level = i; + + n_rows = 0; + + if (slot1->nth_rec < slot1->n_recs) { + n_rows += slot1->n_recs + - slot1->nth_rec; + } + + if (slot2->nth_rec > 1) { + n_rows += slot2->nth_rec - 1; + } + } + } else if (diverged_lot) { + + n_rows = btr_estimate_n_rows_in_range_on_level( + index, slot1, slot2, n_rows, + &is_n_rows_exact); + } + } +} + +/** Estimates the number of rows in a given index range. +@param[in] index index +@param[in] tuple1 range start, may also be empty tuple +@param[in] mode1 search mode for range start +@param[in] tuple2 range end, may also be empty tuple +@param[in] mode2 search mode for range end +@return estimated number of rows */ +ha_rows +btr_estimate_n_rows_in_range( + dict_index_t* index, + btr_pos_t *tuple1, + btr_pos_t *tuple2) +{ + return btr_estimate_n_rows_in_range_low( + index, tuple1, tuple2, 1); +} + +/*******************************************************************//** +Record the number of non_null key values in a given index for +each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index). +The estimates are eventually stored in the array: +index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */ +static +void +btr_record_not_null_field_in_rec( +/*=============================*/ + ulint n_unique, /*!< in: dict_index_get_n_unique(index), + number of columns uniquely determine + an index entry */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index), + its size could be for all fields or + that of "n_unique" */ + ib_uint64_t* n_not_null) /*!< in/out: array to record number of + not null rows for n-column prefix */ +{ + ulint i; + + ut_ad(rec_offs_n_fields(offsets) >= n_unique); + + if (n_not_null == NULL) { + return; + } + + for (i = 0; i < n_unique; i++) { + if (rec_offs_nth_sql_null(offsets, i)) { + break; + } + + n_not_null[i]++; + } +} + +/** Estimates the number of different key values in a given index, for +each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index). +The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed +0..n_uniq-1) and the number of pages that were sampled is saved in +result.n_sample_sizes[]. +If innodb_stats_method is nulls_ignored, we also record the number of +non-null values for each prefix and stored the estimates in +array result.n_non_null_key_vals. 
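Once the two dive paths have been combined, the raw count is adjusted by the heuristics visible above: an inexact estimate whose paths diverged far above the leaf level is doubled, the result is capped at half the table's estimated row count, and a cap of zero falls back to the whole table. A minimal sketch of that clamping step (names are local to the sketch):

#include <cstdint>

/* Final adjustments applied to an estimate, mirroring the logic above. */
static uint64_t clamp_range_estimate(uint64_t n_rows, bool exact,
                                     bool diverged_high_up,
                                     uint64_t table_n_rows)
{
  if (diverged_high_up && !exact)
    n_rows *= 2;                /* tall trees tend to be underestimated */
  if (!exact && n_rows > table_n_rows / 2) {
    n_rows = table_n_rows / 2;  /* never claim more than half the table */
    if (n_rows == 0)
      n_rows = table_n_rows;    /* 0 or 1 row in the table: assume all match */
  }
  return n_rows;
}

int main()
{
  /* 40 rows estimated inexactly in a 50-row table: doubled to 80,
  then capped to 25. */
  return clamp_range_estimate(40, false, true, 50) == 25 ? 0 : 1;
}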
+@param[in] index index +@return vector with statistics information +empty vector if the index is unavailable. */ +std::vector<index_field_stats_t> +btr_estimate_number_of_different_key_vals(dict_index_t* index) +{ + btr_cur_t cursor; + page_t* page; + rec_t* rec; + ulint n_cols; + ib_uint64_t* n_diff; + ib_uint64_t* n_not_null; + ibool stats_null_not_equal; + uintmax_t n_sample_pages=1; /* number of pages to sample */ + ulint not_empty_flag = 0; + ulint total_external_size = 0; + ulint i; + ulint j; + uintmax_t add_on; + mtr_t mtr; + mem_heap_t* heap = NULL; + rec_offs* offsets_rec = NULL; + rec_offs* offsets_next_rec = NULL; + + std::vector<index_field_stats_t> result; + + /* For spatial index, there is no such stats can be + fetched. */ + ut_ad(!dict_index_is_spatial(index)); + + n_cols = dict_index_get_n_unique(index); + + heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null) + * n_cols + + dict_index_get_n_fields(index) + * (sizeof *offsets_rec + + sizeof *offsets_next_rec)); + + n_diff = (ib_uint64_t*) mem_heap_zalloc( + heap, n_cols * sizeof(n_diff[0])); + + n_not_null = NULL; + + /* Check srv_innodb_stats_method setting, and decide whether we + need to record non-null value and also decide if NULL is + considered equal (by setting stats_null_not_equal value) */ + switch (srv_innodb_stats_method) { + case SRV_STATS_NULLS_IGNORED: + n_not_null = (ib_uint64_t*) mem_heap_zalloc( + heap, n_cols * sizeof *n_not_null); + /* fall through */ + + case SRV_STATS_NULLS_UNEQUAL: + /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL + case, we will treat NULLs as unequal value */ + stats_null_not_equal = TRUE; + break; + + case SRV_STATS_NULLS_EQUAL: + stats_null_not_equal = FALSE; + break; + + default: + ut_error; + } + + if (srv_stats_sample_traditional) { + /* It makes no sense to test more pages than are contained + in the index, thus we lower the number if it is too high */ + if (srv_stats_transient_sample_pages > index->stat_index_size) { + if (index->stat_index_size > 0) { + n_sample_pages = index->stat_index_size; + } + } else { + n_sample_pages = srv_stats_transient_sample_pages; + } + } else { + /* New logaritmic number of pages that are estimated. + Number of pages estimated should be between 1 and + index->stat_index_size. + + If we have only 0 or 1 index pages then we can only take 1 + sample. We have already initialized n_sample_pages to 1. + + So taking index size as I and sample as S and log(I)*S as L + + requirement 1) we want the out limit of the expression to not exceed I; + requirement 2) we want the ideal pages to be at least S; + so the current expression is min(I, max( min(S,I), L) + + looking for simplifications: + + case 1: assume S < I + min(I, max( min(S,I), L) -> min(I , max( S, L)) + + but since L=LOG2(I)*S and log2(I) >=1 L>S always so max(S,L) = L. + + so we have: min(I , L) + + case 2: assume I < S + min(I, max( min(S,I), L) -> min(I, max( I, L)) + + case 2a: L > I + min(I, max( I, L)) -> min(I, L) -> I + + case 2b: when L < I + min(I, max( I, L)) -> min(I, I ) -> I + + so taking all case2 paths is I, our expression is: + n_pages = S < I? min(I,L) : I + */ + if (index->stat_index_size > 1) { + n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size) + ? ut_min(index->stat_index_size, + static_cast<ulint>( + log2(double(index->stat_index_size)) + * double(srv_stats_transient_sample_pages))) + : index->stat_index_size; + } + } + + /* Sanity check */ + ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 
1 : index->stat_index_size)); + + /* We sample some pages in the index to get an estimate */ + + for (i = 0; i < n_sample_pages; i++) { + mtr_start(&mtr); + + bool available; + + available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, + &cursor, &mtr); + + if (!available) { + mtr_commit(&mtr); + mem_heap_free(heap); + + return result; + } + + /* Count the number of different key values for each prefix of + the key on this index page. If the prefix does not determine + the index record uniquely in the B-tree, then we subtract one + because otherwise our algorithm would give a wrong estimate + for an index where there is just one key value. */ + + if (!index->is_readable()) { + mtr_commit(&mtr); + goto exit_loop; + } + + page = btr_cur_get_page(&cursor); + + rec = page_rec_get_next(page_get_infimum_rec(page)); + const ulint n_core = page_is_leaf(page) + ? index->n_core_fields : 0; + + if (!page_rec_is_supremum(rec)) { + not_empty_flag = 1; + offsets_rec = rec_get_offsets(rec, index, offsets_rec, + n_core, + ULINT_UNDEFINED, &heap); + + if (n_not_null != NULL) { + btr_record_not_null_field_in_rec( + n_cols, offsets_rec, n_not_null); + } + } + + while (!page_rec_is_supremum(rec)) { + ulint matched_fields; + rec_t* next_rec = page_rec_get_next(rec); + if (page_rec_is_supremum(next_rec)) { + total_external_size += + btr_rec_get_externally_stored_len( + rec, offsets_rec); + break; + } + + offsets_next_rec = rec_get_offsets(next_rec, index, + offsets_next_rec, + n_core, + ULINT_UNDEFINED, + &heap); + + cmp_rec_rec(rec, next_rec, + offsets_rec, offsets_next_rec, + index, stats_null_not_equal, + &matched_fields); + + for (j = matched_fields; j < n_cols; j++) { + /* We add one if this index record has + a different prefix from the previous */ + + n_diff[j]++; + } + + if (n_not_null != NULL) { + btr_record_not_null_field_in_rec( + n_cols, offsets_next_rec, n_not_null); + } + + total_external_size + += btr_rec_get_externally_stored_len( + rec, offsets_rec); + + rec = next_rec; + /* Initialize offsets_rec for the next round + and assign the old offsets_rec buffer to + offsets_next_rec. */ + { + rec_offs* offsets_tmp = offsets_rec; + offsets_rec = offsets_next_rec; + offsets_next_rec = offsets_tmp; + } + } + + if (n_cols == dict_index_get_n_unique_in_tree(index) + && page_has_siblings(page)) { + + /* If there is more than one leaf page in the tree, + we add one because we know that the first record + on the page certainly had a different prefix than the + last record on the previous index page in the + alphabetical order. Before this fix, if there was + just one big record on each clustered index page, the + algorithm grossly underestimated the number of rows + in the table. */ + + n_diff[n_cols - 1]++; + } + + mtr_commit(&mtr); + } + +exit_loop: + /* If we saw k borders between different key values on + n_sample_pages leaf pages, we can estimate how many + there will be in index->stat_n_leaf_pages */ + + /* We must take into account that our sample actually represents + also the pages used for external storage of fields (those pages are + included in index->stat_n_leaf_pages) */ + + result.reserve(n_cols); + + for (j = 0; j < n_cols; j++) { + index_field_stats_t stat; + + stat.n_diff_key_vals + = BTR_TABLE_STATS_FROM_SAMPLE( + n_diff[j], index, n_sample_pages, + total_external_size, not_empty_flag); + + /* If the tree is small, smaller than + 10 * n_sample_pages + total_external_size, then + the above estimate is ok. 
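The logarithmic sampling rule discussed above works out to n_sample_pages = S < I ? min(I, log2(I) * S) : I, where S is the configured transient sample count and I the index size in pages. A standalone sketch of that sizing rule; the S = 8 in the example is only a plausible setting, not necessarily the server default in every version.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

/* n_sample_pages = S < I ? min(I, log2(I) * S) : I, with a single page
sampled when the index has at most one page. */
static uint64_t logarithmic_sample_pages(uint64_t index_pages,     /* I */
                                         uint64_t sample_setting)  /* S */
{
  if (index_pages <= 1)
    return 1;
  if (sample_setting >= index_pages)
    return index_pages;
  const uint64_t scaled =
    uint64_t(std::log2(double(index_pages)) * double(sample_setting));
  return std::min(index_pages, scaled);
}

int main()
{
  /* A 1,000,000-page index with S = 8: log2(1e6) is about 19.9,
  so roughly 159 pages get sampled instead of 8. */
  std::printf("%llu\n",
              (unsigned long long) logarithmic_sample_pages(1000000, 8));
}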
For bigger trees it is common that we + do not see any borders between key values in the few pages + we pick. But still there may be n_sample_pages + different key values, or even more. Let us try to approximate + that: */ + + add_on = index->stat_n_leaf_pages + / (10 * (n_sample_pages + + total_external_size)); + + if (add_on > n_sample_pages) { + add_on = n_sample_pages; + } + + stat.n_diff_key_vals += add_on; + + stat.n_sample_sizes = n_sample_pages; + + if (n_not_null != NULL) { + stat.n_non_null_key_vals = + BTR_TABLE_STATS_FROM_SAMPLE( + n_not_null[j], index, n_sample_pages, + total_external_size, not_empty_flag); + } + + result.push_back(stat); + } + + mem_heap_free(heap); + + return result; +} + +/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/ + +/***********************************************************//** +Gets the offset of the pointer to the externally stored part of a field. +@return offset of the pointer to the externally stored part */ +static +ulint +btr_rec_get_field_ref_offs( +/*=======================*/ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: index of the external field */ +{ + ulint field_ref_offs; + ulint local_len; + + ut_a(rec_offs_nth_extern(offsets, n)); + field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len); + ut_a(len_is_stored(local_len)); + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE); +} + +/** Gets a pointer to the externally stored part of a field. +@param rec record +@param offsets rec_get_offsets(rec) +@param n index of the externally stored field +@return pointer to the externally stored part */ +#define btr_rec_get_field_ref(rec, offsets, n) \ + ((rec) + btr_rec_get_field_ref_offs(offsets, n)) + +/** Gets the externally stored size of a record, in units of a database page. +@param[in] rec record +@param[in] offsets array returned by rec_get_offsets() +@return externally stored part, in units of a database page */ +ulint +btr_rec_get_externally_stored_len( + const rec_t* rec, + const rec_offs* offsets) +{ + ulint n_fields; + ulint total_extern_len = 0; + ulint i; + + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + + if (!rec_offs_any_extern(offsets)) { + return(0); + } + + n_fields = rec_offs_n_fields(offsets); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + + ulint extern_len = mach_read_from_4( + btr_rec_get_field_ref(rec, offsets, i) + + BTR_EXTERN_LEN + 4); + + total_extern_len += ut_calc_align( + extern_len, ulint(srv_page_size)); + } + } + + return total_extern_len >> srv_page_size_shift; +} + +/*******************************************************************//** +Sets the ownership bit of an externally stored field in a record. 
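btr_rec_get_externally_stored_len() above sums, for each externally stored column, its length rounded up to whole pages and returns the total in pages. The same arithmetic as a self-contained sketch, assuming a 16 KiB page for the example:

#include <cstddef>
#include <cstdint>
#include <cstdio>

/* Sum of the off-page column lengths, each rounded up to whole pages,
expressed in pages; the real code uses ut_calc_align() and a shift by
srv_page_size_shift instead of the divisions here. */
static uint64_t extern_len_in_pages(const uint64_t* extern_lens, size_t n,
                                    uint64_t page_size)
{
  uint64_t total = 0;
  for (size_t i = 0; i < n; i++)
    total += (extern_lens[i] + page_size - 1) / page_size * page_size;
  return total / page_size;
}

int main()
{
  const uint64_t lens[] = {100, 20000, 40000};  /* three off-page columns */
  /* 1 + 2 + 3 = 6 pages for a 16 KiB page size */
  std::printf("%llu\n",
              (unsigned long long) extern_len_in_pages(lens, 3, 16384));
}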
*/ +static +void +btr_cur_set_ownership_of_extern_field( +/*==================================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: clustered index record */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint i, /*!< in: field number */ + bool val, /*!< in: value to set */ + mtr_t* mtr) /*!< in: mtr, or NULL if not logged */ +{ + byte* data; + ulint local_len; + ulint byte_val; + + data = rec_get_nth_field(rec, offsets, i, &local_len); + ut_ad(rec_offs_nth_extern(offsets, i)); + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN); + + if (val) { + byte_val &= ~BTR_EXTERN_OWNER_FLAG; + } else { +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + byte_val |= BTR_EXTERN_OWNER_FLAG; + } + + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val); + page_zip_write_blob_ptr(block, rec, index, offsets, i, mtr); + } else { + mtr->write<1,mtr_t::MAYBE_NOP>(*block, data + local_len + + BTR_EXTERN_LEN, byte_val); + } +} + +/*******************************************************************//** +Marks non-updated off-page fields as disowned by this record. The ownership +must be transferred to the updated record which is inserted elsewhere in the +index tree. In purge only the owner of externally stored field is allowed +to free the field. */ +void +btr_cur_disown_inherited_fields( +/*============================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + const upd_t* update, /*!< in: update vector */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + ut_ad(rec_offs_any_extern(offsets)); + + for (uint16_t i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i) + && !upd_get_field_by_field_no(update, i, false)) { + btr_cur_set_ownership_of_extern_field( + block, rec, index, offsets, i, false, mtr); + } + } +} + +/*******************************************************************//** +Marks all extern fields in a record as owned by the record. This function +should be called if the delete mark of a record is removed: a not delete +marked record always owns all its extern fields. 
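Ownership of an off-page column is tracked by one bit in the column's external field reference, with an inverted encoding: the bit is clear while the record owns the column and set once ownership has been handed to another record version, exactly as btr_cur_set_ownership_of_extern_field() above does. A tiny sketch of that convention; the flag value is a stand-in for BTR_EXTERN_OWNER_FLAG.

#include <cassert>
#include <cstdint>

/* Stand-in for BTR_EXTERN_OWNER_FLAG; only the inverted encoding matters:
CLEAR = owned by this record, SET = disowned. */
static constexpr uint8_t OWNER_FLAG = 0x80;

static void set_ownership(uint8_t& len_byte, bool owned)
{
  len_byte = owned ? uint8_t(len_byte & uint8_t(~OWNER_FLAG))
                   : uint8_t(len_byte | OWNER_FLAG);
}

int main()
{
  uint8_t len_byte = 0;
  set_ownership(len_byte, false);          /* disown: flag set */
  assert(len_byte & OWNER_FLAG);
  set_ownership(len_byte, true);           /* own again: flag cleared */
  assert(!(len_byte & OWNER_FLAG));
}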
*/ +static +void +btr_cur_unmark_extern_fields( +/*=========================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr) /*!< in: mtr, or NULL if not logged */ +{ + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + if (!rec_offs_any_extern(offsets)) { + return; + } + + const ulint n = rec_offs_n_fields(offsets); + + for (ulint i = 0; i < n; i++) { + if (rec_offs_nth_extern(offsets, i)) { + btr_cur_set_ownership_of_extern_field( + block, rec, index, offsets, i, true, mtr); + } + } +} + +/*******************************************************************//** +Returns the length of a BLOB part stored on the header page. +@return part length */ +static +uint32_t +btr_blob_get_part_len( +/*==================*/ + const byte* blob_header) /*!< in: blob header */ +{ + return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN)); +} + +/*******************************************************************//** +Returns the page number where the next BLOB part is stored. +@return page number or FIL_NULL if no more pages */ +static +uint32_t +btr_blob_get_next_page_no( +/*======================*/ + const byte* blob_header) /*!< in: blob header */ +{ + return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO)); +} + +/** Deallocate a buffer block that was reserved for a BLOB part. +@param block buffer block +@param all flag whether to remove a ROW_FORMAT=COMPRESSED page +@param mtr mini-transaction to commit */ +static void btr_blob_free(buf_block_t *block, bool all, mtr_t *mtr) +{ + const page_id_t page_id(block->page.id()); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + mtr->commit(); + + const ulint fold= page_id.fold(); + + mysql_mutex_lock(&buf_pool.mutex); + + if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold)) + if (!buf_LRU_free_page(bpage, all) && all && bpage->zip.data) + /* Attempt to deallocate the redundant copy of the uncompressed page + if the whole ROW_FORMAT=COMPRESSED block cannot be deallocted. */ + buf_LRU_free_page(bpage, false); + + mysql_mutex_unlock(&buf_pool.mutex); +} + +/** Helper class used while writing blob pages, during insert or update. */ +struct btr_blob_log_check_t { + /** Persistent cursor on a clusterex index record with blobs. */ + btr_pcur_t* m_pcur; + /** Mini transaction holding the latches for m_pcur */ + mtr_t* m_mtr; + /** rec_get_offsets(rec, index); offset of clust_rec */ + const rec_offs* m_offsets; + /** The block containing clustered record */ + buf_block_t** m_block; + /** The clustered record pointer */ + rec_t** m_rec; + /** The blob operation code */ + enum blob_op m_op; + + /** Constructor + @param[in] pcur persistent cursor on a clustered + index record with blobs. + @param[in] mtr mini-transaction holding latches for + pcur. 
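Each uncompressed BLOB page carries a small part header holding the length of the part stored on that page and the page number of the next part, with FIL_NULL ending the chain; btr_blob_get_part_len() and btr_blob_get_next_page_no() above read those two fields. A sketch of walking such a chain, with an in-memory map standing in for fetching pages via buf_page_get():

#include <cstdint>
#include <map>
#include <vector>

static constexpr uint32_t NO_NEXT_PAGE = 0xFFFFFFFF;  /* stand-in for FIL_NULL */

/* Toy model of one BLOB part; on a real page the part length and the next
page number are 4-byte fields in the BLOB header. */
struct blob_part {
  std::vector<uint8_t> data;  /* payload stored on this page */
  uint32_t next_page_no;      /* NO_NEXT_PAGE if this is the last part */
};

/* Reassemble a BLOB by following the next-page chain from its first page. */
static std::vector<uint8_t>
read_blob(const std::map<uint32_t, blob_part>& pages, uint32_t first_page_no)
{
  std::vector<uint8_t> out;
  for (uint32_t no = first_page_no; no != NO_NEXT_PAGE; ) {
    const blob_part& part = pages.at(no);
    out.insert(out.end(), part.data.begin(), part.data.end());
    no = part.next_page_no;
  }
  return out;
}

int main()
{
  std::map<uint32_t, blob_part> pages;
  pages[10] = {{'h', 'e', 'l'}, 11};
  pages[11] = {{'l', 'o'}, NO_NEXT_PAGE};
  return read_blob(pages, 10).size() == 5 ? 0 : 1;
}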
+ @param[in] offsets offsets of the clust_rec + @param[in,out] block record block containing pcur record + @param[in,out] rec the clustered record pointer + @param[in] op the blob operation code */ + btr_blob_log_check_t( + btr_pcur_t* pcur, + mtr_t* mtr, + const rec_offs* offsets, + buf_block_t** block, + rec_t** rec, + enum blob_op op) + : m_pcur(pcur), + m_mtr(mtr), + m_offsets(offsets), + m_block(block), + m_rec(rec), + m_op(op) + { + ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets)); + ut_ad((*m_block)->frame == page_align(*m_rec)); + ut_ad(*m_rec == btr_pcur_get_rec(m_pcur)); + } + + /** Check if there is enough space in log file. Commit and re-start the + mini transaction. */ + void check() + { + dict_index_t* index = m_pcur->index(); + ulint offs = 0; + uint32_t page_no = FIL_NULL; + + if (UNIV_UNLIKELY(m_op == BTR_STORE_INSERT_BULK)) { + offs = page_offset(*m_rec); + page_no = (*m_block)->page.id().page_no(); + buf_block_buf_fix_inc(*m_block, __FILE__, __LINE__); + ut_ad(page_no != FIL_NULL); + } else { + btr_pcur_store_position(m_pcur, m_mtr); + } + m_mtr->commit(); + + DEBUG_SYNC_C("blob_write_middle"); + + log_free_check(); + + DEBUG_SYNC_C("blob_write_middle_after_check"); + + const mtr_log_t log_mode = m_mtr->get_log_mode(); + m_mtr->start(); + m_mtr->set_log_mode(log_mode); + index->set_modified(*m_mtr); + + if (UNIV_UNLIKELY(page_no != FIL_NULL)) { + m_pcur->btr_cur.page_cur.block = btr_block_get( + *index, page_no, RW_X_LATCH, false, m_mtr); + m_pcur->btr_cur.page_cur.rec + = m_pcur->btr_cur.page_cur.block->frame + + offs; + + buf_block_buf_fix_dec(m_pcur->btr_cur.page_cur.block); + } else { + ut_ad(m_pcur->rel_pos == BTR_PCUR_ON); + bool ret = btr_pcur_restore_position( + BTR_MODIFY_LEAF | BTR_MODIFY_EXTERNAL, + m_pcur, m_mtr); + + ut_a(ret); + } + + *m_block = btr_pcur_get_block(m_pcur); + *m_rec = btr_pcur_get_rec(m_pcur); + + rec_offs_make_valid(*m_rec, index, true, + const_cast<rec_offs*>(m_offsets)); + + ut_ad(m_mtr->memo_contains_page_flagged( + *m_rec, + MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)); + + ut_ad((m_op == BTR_STORE_INSERT_BULK) + == !m_mtr->memo_contains_flagged(&index->lock, + MTR_MEMO_SX_LOCK + | MTR_MEMO_X_LOCK)); + } +}; + +/*******************************************************************//** +Stores the fields in big_rec_vec to the tablespace and puts pointers to +them in rec. The extern flags in rec will have to be set beforehand. +The fields are stored on pages allocated from leaf node +file segment of the index tree. + +TODO: If the allocation extends the tablespace, it will not be redo logged, in +any mini-transaction. Tablespace extension should be redo-logged, so that +recovery will not fail when the big_rec was written to the extended portion of +the file, in case the file was somehow truncated in the crash. + +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +dberr_t +btr_store_big_rec_extern_fields( +/*============================*/ + btr_pcur_t* pcur, /*!< in/out: a persistent cursor. if + btr_mtr is restarted, then this can + be repositioned. */ + rec_offs* offsets, /*!< in/out: rec_get_offsets() on + pcur. the "external storage" flags + in offsets will correctly correspond + to rec when this function returns */ + const big_rec_t*big_rec_vec, /*!< in: vector containing fields + to be stored externally */ + mtr_t* btr_mtr, /*!< in/out: mtr containing the + latches to the clustered index. can be + committed and restarted. */ + enum blob_op op) /*! 
in: operation code */ +{ + byte* field_ref; + ulint extern_len; + ulint store_len; + ulint space_id; + ulint i; + mtr_t mtr; + mem_heap_t* heap = NULL; + page_zip_des_t* page_zip; + z_stream c_stream; + dberr_t error = DB_SUCCESS; + dict_index_t* index = pcur->index(); + buf_block_t* rec_block = btr_pcur_get_block(pcur); + rec_t* rec = btr_pcur_get_rec(pcur); + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_any_extern(offsets)); + ut_ad(op == BTR_STORE_INSERT_BULK + || btr_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(btr_mtr->memo_contains_flagged(rec_block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(buf_block_get_frame(rec_block) == page_align(rec)); + ut_a(dict_index_is_clust(index)); + + btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block, + &rec, op); + page_zip = buf_block_get_page_zip(rec_block); + space_id = rec_block->page.id().space(); + ut_a(fil_page_index_page_check(page_align(rec)) + || op == BTR_STORE_INSERT_BULK); + + if (page_zip) { + int err; + + /* Zlib deflate needs 128 kilobytes for the default + window size, plus 512 << memLevel, plus a few + kilobytes for small objects. We use reduced memLevel + to limit the memory consumption, and preallocate the + heap, hoping to avoid memory fragmentation. */ + heap = mem_heap_create(250000); + page_zip_set_alloc(&c_stream, heap); + + err = deflateInit2(&c_stream, int(page_zip_level), + Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY); + ut_a(err == Z_OK); + } + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* All pointers to externally stored columns in the record + must either be zero or they must be pointers to inherited + columns, owned by this record or an earlier record version. */ + for (i = 0; i < big_rec_vec->n_fields; i++) { + field_ref = btr_rec_get_field_ref( + rec, offsets, big_rec_vec->fields[i].field_no); + + ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)); + /* Either this must be an update in place, + or the BLOB must be inherited, or the BLOB pointer + must be zero (will be written in this function). */ + ut_a(op == BTR_STORE_UPDATE + || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG) + || !memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + } +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + /* Space available in compressed page to carry blob data */ + const ulint payload_size_zip = rec_block->physical_size() + - FIL_PAGE_DATA; + + /* Space available in uncompressed page to carry blob data */ + const ulint payload_size = payload_size_zip + - (BTR_BLOB_HDR_SIZE + FIL_PAGE_DATA_END); + + /* We have to create a file segment to the tablespace + for each field and put the pointer to the field in rec */ + + for (i = 0; i < big_rec_vec->n_fields; i++) { + const ulint field_no = big_rec_vec->fields[i].field_no; + + field_ref = btr_rec_get_field_ref(rec, offsets, field_no); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* A zero BLOB pointer should have been initially inserted. 
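+		The 20-byte field reference (space id, first page number,
+		offset, and 8-byte length) is only filled in further below,
+		once the first BLOB page for this field has been allocated.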
*/ + ut_a(!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + extern_len = big_rec_vec->fields[i].len; + MEM_CHECK_DEFINED(big_rec_vec->fields[i].data, extern_len); + ut_a(extern_len > 0); + + uint32_t prev_page_no = FIL_NULL; + + if (page_zip) { + int err = deflateReset(&c_stream); + ut_a(err == Z_OK); + + c_stream.next_in = (Bytef*) + big_rec_vec->fields[i].data; + c_stream.avail_in = static_cast<uInt>(extern_len); + } + + for (ulint blob_npages = 0;; ++blob_npages) { + buf_block_t* block; + const ulint commit_freq = 4; + uint32_t r_extents; + + ut_ad(page_align(field_ref) == page_align(rec)); + + if (!(blob_npages % commit_freq)) { + + redo_log.check(); + + field_ref = btr_rec_get_field_ref( + rec, offsets, field_no); + + page_zip = buf_block_get_page_zip(rec_block); + } + + mtr.start(); + index->set_modified(mtr); + mtr.set_log_mode(btr_mtr->get_log_mode()); + + buf_page_get(rec_block->page.id(), + rec_block->zip_size(), RW_X_LATCH, &mtr); + + uint32_t hint_prev = prev_page_no; + if (hint_prev == FIL_NULL) { + hint_prev = rec_block->page.id().page_no(); + } + + if (!fsp_reserve_free_extents(&r_extents, + index->table->space, 1, + FSP_BLOB, &mtr, 1)) { + mtr.commit(); + error = DB_OUT_OF_FILE_SPACE; + goto func_exit; + } + + block = btr_page_alloc(index, hint_prev + 1, + FSP_NO_DIR, 0, &mtr, &mtr); + + index->table->space->release_free_extents(r_extents); + + ut_a(block != NULL); + + const uint32_t page_no = block->page.id().page_no(); + + if (prev_page_no != FIL_NULL) { + buf_block_t* prev_block; + + prev_block = buf_page_get( + page_id_t(space_id, prev_page_no), + rec_block->zip_size(), + RW_X_LATCH, &mtr); + + buf_block_dbg_add_level(prev_block, + SYNC_EXTERN_STORAGE); + + if (page_zip) { + mtr.write<4>(*prev_block, + prev_block->frame + + FIL_PAGE_NEXT, + page_no); + memcpy_aligned<4>( + buf_block_get_page_zip( + prev_block) + ->data + FIL_PAGE_NEXT, + prev_block->frame + + FIL_PAGE_NEXT, 4); + } else { + mtr.write<4>(*prev_block, + BTR_BLOB_HDR_NEXT_PAGE_NO + + FIL_PAGE_DATA + + prev_block->frame, + page_no); + } + } else if (dict_index_is_online_ddl(index)) { + row_log_table_blob_alloc(index, page_no); + } + + ut_ad(!page_has_siblings(block->frame)); + ut_ad(!fil_page_get_type(block->frame)); + + if (page_zip) { + int err; + page_zip_des_t* blob_page_zip; + + mtr.write<1>(*block, + FIL_PAGE_TYPE + 1 + block->frame, + prev_page_no == FIL_NULL + ? FIL_PAGE_TYPE_ZBLOB + : FIL_PAGE_TYPE_ZBLOB2); + block->page.zip.data[FIL_PAGE_TYPE + 1] + = block->frame[FIL_PAGE_TYPE + 1]; + + c_stream.next_out = block->frame + + FIL_PAGE_DATA; + c_stream.avail_out = static_cast<uInt>( + payload_size_zip); + + err = deflate(&c_stream, Z_FINISH); + ut_a(err == Z_OK || err == Z_STREAM_END); + ut_a(err == Z_STREAM_END + || c_stream.avail_out == 0); + + mtr.memcpy(*block, + FIL_PAGE_DATA, + page_zip_get_size(page_zip) + - FIL_PAGE_DATA + - c_stream.avail_out); + /* Copy the page to compressed storage, + because it will be flushed to disk + from there. 
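+				The uncompressed frame is only a scratch
+				buffer here; btr_blob_free() below lets the
+				buffer pool drop it while keeping the
+				compressed copy that will be written to disk.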
*/ + blob_page_zip = buf_block_get_page_zip(block); + ut_ad(blob_page_zip); + ut_ad(page_zip_get_size(blob_page_zip) + == page_zip_get_size(page_zip)); + memcpy(blob_page_zip->data, block->frame, + page_zip_get_size(page_zip)); + + if (err == Z_OK && prev_page_no != FIL_NULL) { + + goto next_zip_page; + } + + if (err == Z_STREAM_END) { + mach_write_to_4(field_ref + + BTR_EXTERN_LEN, 0); + mach_write_to_4(field_ref + + BTR_EXTERN_LEN + 4, + c_stream.total_in); + } else { + memset(field_ref + BTR_EXTERN_LEN, + 0, 8); + } + + if (prev_page_no == FIL_NULL) { + ut_ad(blob_npages == 0); + mach_write_to_4(field_ref + + BTR_EXTERN_SPACE_ID, + space_id); + + mach_write_to_4(field_ref + + BTR_EXTERN_PAGE_NO, + page_no); + + mach_write_to_4(field_ref + + BTR_EXTERN_OFFSET, + FIL_PAGE_NEXT); + } + + /* We compress a page when finish bulk insert.*/ + if (UNIV_LIKELY(op != BTR_STORE_INSERT_BULK)) { + page_zip_write_blob_ptr( + rec_block, rec, index, offsets, + field_no, &mtr); + } + +next_zip_page: + prev_page_no = page_no; + + /* Commit mtr and release the + uncompressed page frame to save memory. */ + btr_blob_free(block, FALSE, &mtr); + + if (err == Z_STREAM_END) { + break; + } + } else { + mtr.write<1>(*block, FIL_PAGE_TYPE + 1 + + block->frame, + FIL_PAGE_TYPE_BLOB); + + if (extern_len > payload_size) { + store_len = payload_size; + } else { + store_len = extern_len; + } + + mtr.memcpy<mtr_t::MAYBE_NOP>( + *block, + FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE + + block->frame, + static_cast<const byte*> + (big_rec_vec->fields[i].data) + + big_rec_vec->fields[i].len + - extern_len, store_len); + mtr.write<4>(*block, BTR_BLOB_HDR_PART_LEN + + FIL_PAGE_DATA + block->frame, + store_len); + compile_time_assert(FIL_NULL == 0xffffffff); + mtr.memset(block, BTR_BLOB_HDR_NEXT_PAGE_NO + + FIL_PAGE_DATA, 4, 0xff); + + extern_len -= store_len; + + ut_ad(!mach_read_from_4(BTR_EXTERN_LEN + + field_ref)); + mtr.write<4>(*rec_block, + BTR_EXTERN_LEN + 4 + field_ref, + big_rec_vec->fields[i].len + - extern_len); + + if (prev_page_no == FIL_NULL) { + ut_ad(blob_npages == 0); + mtr.write<4,mtr_t::MAYBE_NOP>( + *rec_block, + field_ref + BTR_EXTERN_SPACE_ID, + space_id); + + mtr.write<4>(*rec_block, field_ref + + BTR_EXTERN_PAGE_NO, + page_no); + + mtr.write<4>(*rec_block, field_ref + + BTR_EXTERN_OFFSET, + FIL_PAGE_DATA); + } + + prev_page_no = page_no; + + mtr.commit(); + + if (extern_len == 0) { + break; + } + } + } + + DBUG_EXECUTE_IF("btr_store_big_rec_extern", + error = DB_OUT_OF_FILE_SPACE; + goto func_exit;); + + rec_offs_make_nth_extern(offsets, field_no); + } + +func_exit: + if (page_zip) { + deflateEnd(&c_stream); + } + + if (heap != NULL) { + mem_heap_free(heap); + } + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* All pointers to externally stored columns in the record + must be valid. */ + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (!rec_offs_nth_extern(offsets, i)) { + continue; + } + + field_ref = btr_rec_get_field_ref(rec, offsets, i); + + /* The pointer must not be zero if the operation + succeeded. */ + ut_a(0 != memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE) + || error != DB_SUCCESS); + /* The column must not be disowned by this record. */ + ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)); + } +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + return(error); +} + +/** Check the FIL_PAGE_TYPE on an uncompressed BLOB page. 
+@param[in] block uncompressed BLOB page +@param[in] read true=read, false=purge */ +static void btr_check_blob_fil_page_type(const buf_block_t& block, bool read) +{ + uint16_t type= fil_page_get_type(block.frame); + + if (UNIV_LIKELY(type == FIL_PAGE_TYPE_BLOB)) + return; + /* FIXME: take the tablespace as a parameter */ + if (fil_space_t *space= fil_space_t::get(block.page.id().space())) + { + /* Old versions of InnoDB did not initialize FIL_PAGE_TYPE on BLOB + pages. Do not print anything about the type mismatch when reading + a BLOB page that may be from old versions. */ + if (space->full_crc32() || DICT_TF_HAS_ATOMIC_BLOBS(space->flags)) + { + ib::fatal() << "FIL_PAGE_TYPE=" << type + << (read ? " on BLOB read file " : " on BLOB purge file ") + << space->chain.start->name + << " page " << block.page.id().page_no(); + } + space->release(); + } +} + +/*******************************************************************//** +Frees the space in an externally stored field to the file space +management if the field in data is owned by the externally stored field, +in a rollback we may have the additional condition that the field must +not be inherited. */ +void +btr_free_externally_stored_field( +/*=============================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched; if the tree + height is 1, then also the root page + must be X-latched! (this is relevant + in the case this function is called + from purge where 'data' is located on + an undo log page, not an index + page) */ + byte* field_ref, /*!< in/out: field reference */ + const rec_t* rec, /*!< in: record containing field_ref, for + page_zip_write_blob_ptr(), or NULL */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index), + or NULL */ + buf_block_t* block, /*!< in/out: page of field_ref */ + ulint i, /*!< in: field number of field_ref; + ignored if rec == NULL */ + bool rollback, /*!< in: performing rollback? */ + mtr_t* local_mtr) /*!< in: mtr + containing the latch to data an an + X-latch to the index tree */ +{ + page_t* page; + const uint32_t space_id = mach_read_from_4( + field_ref + BTR_EXTERN_SPACE_ID); + const uint32_t start_page = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + uint32_t page_no; + uint32_t next_page_no; + mtr_t mtr; + + ut_ad(index->is_primary()); + ut_ad(local_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(local_mtr->memo_contains_page_flagged(field_ref, + MTR_MEMO_PAGE_X_FIX)); + ut_ad(!rec || rec_offs_validate(rec, index, offsets)); + ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i)); + ut_ad(local_mtr->is_named_space( + page_get_space_id(page_align(field_ref)))); + + if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE))) { + /* In the rollback, we may encounter a clustered index + record with some unwritten off-page columns. There is + nothing to free then. */ + ut_a(rollback); + return; + } + + ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN) + & ~((BTR_EXTERN_OWNER_FLAG + | BTR_EXTERN_INHERITED_FLAG) << 24))); + ut_ad(space_id == index->table->space->id); + ut_ad(space_id == index->table->space_id); + + const ulint ext_zip_size = index->table->space->zip_size(); + const ulint rec_zip_size = rec ? 
ext_zip_size : 0; + + /* !rec holds in a call from purge when field_ref is in an undo page */ + ut_ad(rec || !block->page.zip.data); + + for (;;) { +#ifdef UNIV_DEBUG + buf_block_t* rec_block; +#endif /* UNIV_DEBUG */ + buf_block_t* ext_block; + + mtr_start(&mtr); + mtr.set_spaces(*local_mtr); + mtr.set_log_mode(local_mtr->get_log_mode()); + + ut_ad(!index->table->is_temporary() + || local_mtr->get_log_mode() == MTR_LOG_NO_REDO); + + const page_t* p = page_align(field_ref); + + const page_id_t page_id(page_get_space_id(p), + page_get_page_no(p)); + +#ifdef UNIV_DEBUG + rec_block = +#endif /* UNIV_DEBUG */ + buf_page_get(page_id, rec_zip_size, RW_X_LATCH, &mtr); + + buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK); + page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO); + + if (/* There is no external storage data */ + page_no == FIL_NULL + /* This field does not own the externally stored field */ + || (mach_read_from_1(field_ref + BTR_EXTERN_LEN) + & BTR_EXTERN_OWNER_FLAG) + /* Rollback and inherited field */ + || (rollback + && (mach_read_from_1(field_ref + BTR_EXTERN_LEN) + & BTR_EXTERN_INHERITED_FLAG))) { + + /* Do not free */ + mtr_commit(&mtr); + + return; + } + + if (page_no == start_page && dict_index_is_online_ddl(index)) { + row_log_table_blob_free(index, start_page); + } + + ext_block = buf_page_get( + page_id_t(space_id, page_no), ext_zip_size, + RW_X_LATCH, &mtr); + + buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE); + page = buf_block_get_frame(ext_block); + + if (ext_zip_size) { + /* Note that page_zip will be NULL + in row_purge_upd_exist_or_extern(). */ + switch (fil_page_get_type(page)) { + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + break; + default: + ut_error; + } + next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT); + + btr_page_free(index, ext_block, &mtr, true); + + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO, + next_page_no); + memset(field_ref + BTR_EXTERN_LEN + 4, 0, 4); + page_zip_write_blob_ptr(block, rec, index, + offsets, i, &mtr); + } else { + mtr.write<4>(*block, + BTR_EXTERN_PAGE_NO + field_ref, + next_page_no); + mtr.write<4,mtr_t::MAYBE_NOP>(*block, + BTR_EXTERN_LEN + + 4 + field_ref, + 0U); + } + } else { + ut_ad(!block->page.zip.data); + btr_check_blob_fil_page_type(*ext_block, false); + + next_page_no = mach_read_from_4( + page + FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO); + btr_page_free(index, ext_block, &mtr, true); + + mtr.write<4>(*block, BTR_EXTERN_PAGE_NO + field_ref, + next_page_no); + /* Zero out the BLOB length. If the server + crashes during the execution of this function, + trx_rollback_all_recovered() could + dereference the half-deleted BLOB, fetching a + wrong prefix for the BLOB. */ + mtr.write<4,mtr_t::MAYBE_NOP>(*block, + BTR_EXTERN_LEN + 4 + + field_ref, 0U); + } + + /* Commit mtr and release the BLOB block to save memory. */ + btr_blob_free(ext_block, TRUE, &mtr); + } +} + +/***********************************************************//** +Frees the externally stored fields for a record. */ +static +void +btr_rec_free_externally_stored_fields( +/*==================================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched */ + rec_t* rec, /*!< in/out: record */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + buf_block_t* block, /*!< in: index page of rec */ + bool rollback,/*!< in: performing rollback? 
*/ + mtr_t* mtr) /*!< in: mini-transaction handle which contains + an X-latch to record page and to the index + tree */ +{ + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX)); + ut_ad(index->is_primary()); + ut_ad(page_rec_is_leaf(rec)); + /* Free possible externally stored fields in the record */ + + ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets)); + n_fields = rec_offs_n_fields(offsets); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + btr_free_externally_stored_field( + index, btr_rec_get_field_ref(rec, offsets, i), + rec, offsets, block, i, rollback, mtr); + } + } +} + +/***********************************************************//** +Frees the externally stored fields for a record, if the field is mentioned +in the update vector. */ +static +void +btr_rec_free_updated_extern_fields( +/*===============================*/ + dict_index_t* index, /*!< in: index of rec; the index tree MUST be + X-latched */ + rec_t* rec, /*!< in/out: record */ + buf_block_t* block, /*!< in: index page of rec */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update, /*!< in: update vector */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr) /*!< in: mini-transaction handle which contains + an X-latch to record page and to the tree */ +{ + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX)); + + /* Free possible externally stored fields in the record */ + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + const upd_field_t* ufield = upd_get_nth_field(update, i); + + if (rec_offs_nth_extern(offsets, ufield->field_no)) { + ulint len; + byte* data = rec_get_nth_field( + rec, offsets, ufield->field_no, &len); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + + btr_free_externally_stored_field( + index, data + len - BTR_EXTERN_FIELD_REF_SIZE, + rec, offsets, block, + ufield->field_no, rollback, mtr); + } + } +} + +/*******************************************************************//** +Copies the prefix of an uncompressed BLOB. The clustered index record +that points to this BLOB must be protected by a lock or a page latch. 
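+Each BLOB page stores, at FIL_PAGE_DATA, a small header holding the length
+of the part kept on that page and the number of the next page in the chain;
+this function simply follows that singly-linked list.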
+@return number of bytes written to buf */ +static +ulint +btr_copy_blob_prefix( +/*=================*/ + byte* buf, /*!< out: the externally stored part of + the field, or a prefix of it */ + uint32_t len, /*!< in: length of buf, in bytes */ + page_id_t id, /*!< in: page identifier of the first BLOB page */ + uint32_t offset) /*!< in: offset on the first BLOB page */ +{ + ulint copied_len = 0; + + for (;;) { + mtr_t mtr; + buf_block_t* block; + const page_t* page; + const byte* blob_header; + ulint part_len; + ulint copy_len; + + mtr_start(&mtr); + + block = buf_page_get(id, 0, RW_S_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE); + page = buf_block_get_frame(block); + + btr_check_blob_fil_page_type(*block, true); + + blob_header = page + offset; + part_len = btr_blob_get_part_len(blob_header); + copy_len = ut_min(part_len, len - copied_len); + + memcpy(buf + copied_len, + blob_header + BTR_BLOB_HDR_SIZE, copy_len); + copied_len += copy_len; + + id.set_page_no(btr_blob_get_next_page_no(blob_header)); + + mtr_commit(&mtr); + + if (id.page_no() == FIL_NULL || copy_len != part_len) { + MEM_CHECK_DEFINED(buf, copied_len); + return(copied_len); + } + + /* On other BLOB pages except the first the BLOB header + always is at the page data start: */ + + offset = FIL_PAGE_DATA; + + ut_ad(copied_len <= len); + } +} + +/** Copies the prefix of a compressed BLOB. +The clustered index record that points to this BLOB must be protected +by a lock or a page latch. +@param[out] buf the externally stored part of the field, +or a prefix of it +@param[in] len length of buf, in bytes +@param[in] zip_size ROW_FORMAT=COMPRESSED page size +@param[in] id page identifier of the BLOB pages +@return number of bytes written to buf */ +static +ulint +btr_copy_zblob_prefix( + byte* buf, + uint32_t len, + ulint zip_size, + page_id_t id, + uint32_t offset) +{ + ulint page_type = FIL_PAGE_TYPE_ZBLOB; + mem_heap_t* heap; + int err; + z_stream d_stream; + + d_stream.next_out = buf; + d_stream.avail_out = static_cast<uInt>(len); + d_stream.next_in = Z_NULL; + d_stream.avail_in = 0; + + /* Zlib inflate needs 32 kilobytes for the default + window size, plus a few kilobytes for small objects. */ + heap = mem_heap_create(40000); + page_zip_set_alloc(&d_stream, heap); + + ut_ad(zip_size); + ut_ad(ut_is_2pow(zip_size)); + ut_ad(id.space()); + + err = inflateInit(&d_stream); + ut_a(err == Z_OK); + + for (;;) { + buf_page_t* bpage; + uint32_t next_page_no; + + /* There is no latch on bpage directly. Instead, + bpage is protected by the B-tree page latch that + is being held on the clustered index record, or, + in row_merge_copy_blobs(), by an exclusive table lock. */ + bpage = buf_page_get_zip(id, zip_size); + + if (UNIV_UNLIKELY(!bpage)) { + ib::error() << "Cannot load compressed BLOB " << id; + goto func_exit; + } + + if (UNIV_UNLIKELY + (fil_page_get_type(bpage->zip.data) != page_type)) { + + ib::error() << "Unexpected type " + << fil_page_get_type(bpage->zip.data) + << " of compressed BLOB page " << id; + + ut_ad(0); + goto end_of_blob; + } + + next_page_no = mach_read_from_4(bpage->zip.data + offset); + + if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) { + /* When the BLOB begins at page header, + the compressed data payload does not + immediately follow the next page pointer. 
*/ + offset = FIL_PAGE_DATA; + } else { + offset += 4; + } + + d_stream.next_in = bpage->zip.data + offset; + d_stream.avail_in = uInt(zip_size - offset); + + err = inflate(&d_stream, Z_NO_FLUSH); + switch (err) { + case Z_OK: + if (!d_stream.avail_out) { + goto end_of_blob; + } + break; + case Z_STREAM_END: + if (next_page_no == FIL_NULL) { + goto end_of_blob; + } + /* fall through */ + default: +inflate_error: + ib::error() << "inflate() of compressed BLOB page " + << id + << " returned " << err + << " (" << d_stream.msg << ")"; + + case Z_BUF_ERROR: + goto end_of_blob; + } + + if (next_page_no == FIL_NULL) { + if (!d_stream.avail_in) { + ib::error() + << "Unexpected end of compressed " + << "BLOB page " << id; + } else { + err = inflate(&d_stream, Z_FINISH); + switch (err) { + case Z_STREAM_END: + case Z_BUF_ERROR: + break; + default: + goto inflate_error; + } + } + +end_of_blob: + buf_page_release_zip(bpage); + goto func_exit; + } + + buf_page_release_zip(bpage); + + /* On other BLOB pages except the first + the BLOB header always is at the page header: */ + + id.set_page_no(next_page_no); + offset = FIL_PAGE_NEXT; + page_type = FIL_PAGE_TYPE_ZBLOB2; + } + +func_exit: + inflateEnd(&d_stream); + mem_heap_free(heap); + MEM_CHECK_DEFINED(buf, d_stream.total_out); + return(d_stream.total_out); +} + +/** Copies the prefix of an externally stored field of a record. +The clustered index record that points to this BLOB must be protected +by a lock or a page latch. +@param[out] buf the externally stored part of the +field, or a prefix of it +@param[in] len length of buf, in bytes +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] id page identifier of the first BLOB page +@param[in] offset offset on the first BLOB page +@return number of bytes written to buf */ +static +ulint +btr_copy_externally_stored_field_prefix_low( + byte* buf, + uint32_t len, + ulint zip_size, + page_id_t id, + uint32_t offset) +{ + if (len == 0) + return 0; + + return zip_size + ? btr_copy_zblob_prefix(buf, len, zip_size, id, offset) + : btr_copy_blob_prefix(buf, len, id, offset); +} + +/** Copies the prefix of an externally stored field of a record. +The clustered index record must be protected by a lock or a page latch. +@param[out] buf the field, or a prefix of it +@param[in] len length of buf, in bytes +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] data 'internally' stored part of the field +containing also the reference to the external part; must be protected by +a lock or a page latch +@param[in] local_len length of data, in bytes +@return the length of the copied field, or 0 if the column was being +or has been deleted */ +ulint +btr_copy_externally_stored_field_prefix( + byte* buf, + ulint len, + ulint zip_size, + const byte* data, + ulint local_len) +{ + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY(local_len >= len)) { + memcpy(buf, data, len); + return(len); + } + + memcpy(buf, data, local_len); + data += local_len; + + ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)); + + if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) { + /* The externally stored part of the column has been + (partially) deleted. Signal the half-deleted BLOB + to the caller. 
*/ + + return(0); + } + + uint32_t space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID); + uint32_t page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO); + uint32_t offset = mach_read_from_4(data + BTR_EXTERN_OFFSET); + len -= local_len; + + return(local_len + + btr_copy_externally_stored_field_prefix_low(buf + local_len, + uint32_t(len), + zip_size, + page_id_t( + space_id, + page_no), + offset)); +} + +/** Copies an externally stored field of a record to mem heap. +The clustered index record must be protected by a lock or a page latch. +@param[out] len length of the whole field +@param[in] data 'internally' stored part of the field +containing also the reference to the external part; must be protected by +a lock or a page latch +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] local_len length of data +@param[in,out] heap mem heap +@return the whole field copied to heap */ +byte* +btr_copy_externally_stored_field( + ulint* len, + const byte* data, + ulint zip_size, + ulint local_len, + mem_heap_t* heap) +{ + byte* buf; + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + uint32_t space_id = mach_read_from_4(data + local_len + + BTR_EXTERN_SPACE_ID); + uint32_t page_no = mach_read_from_4(data + local_len + + BTR_EXTERN_PAGE_NO); + uint32_t offset = mach_read_from_4(data + local_len + + BTR_EXTERN_OFFSET); + + /* Currently a BLOB cannot be bigger than 4 GB; we + leave the 4 upper bytes in the length field unused */ + + uint32_t extern_len = mach_read_from_4(data + local_len + + BTR_EXTERN_LEN + 4); + + buf = (byte*) mem_heap_alloc(heap, local_len + extern_len); + + memcpy(buf, data, local_len); + *len = local_len + + btr_copy_externally_stored_field_prefix_low(buf + local_len, + extern_len, + zip_size, + page_id_t( + space_id, + page_no), + offset); + + return(buf); +} + +/** Copies an externally stored field of a record to mem heap. +@param[in] rec record in a clustered index; must be +protected by a lock or a page latch +@param[in] offset array returned by rec_get_offsets() +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] no field number +@param[out] len length of the field +@param[in,out] heap mem heap +@return the field copied to heap, or NULL if the field is incomplete */ +byte* +btr_rec_copy_externally_stored_field( + const rec_t* rec, + const rec_offs* offsets, + ulint zip_size, + ulint no, + ulint* len, + mem_heap_t* heap) +{ + ulint local_len; + const byte* data; + + ut_a(rec_offs_nth_extern(offsets, no)); + + /* An externally stored field can contain some initial + data from the field, and in the last 20 bytes it has the + space id, page number, and offset where the rest of the + field data is stored, and the data length in addition to + the data stored locally. We may need to store some data + locally to get the local record length above the 128 byte + limit so that field offsets are stored in two bytes, and + the extern bit is available in those two bytes. */ + + data = rec_get_nth_field(rec, offsets, no, &local_len); + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + if (UNIV_UNLIKELY + (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) { + /* The externally stored field was not written yet. + This record should only be seen by + trx_rollback_recovered() or any + TRX_ISO_READ_UNCOMMITTED transactions. 
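+		Returning NULL lets such callers treat the column as
+		incomplete instead of following a reference that has not
+		been written yet.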
*/
+		return(NULL);
+	}
+
+	return(btr_copy_externally_stored_field(len, data,
+						zip_size, local_len, heap));
+}
diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc
new file mode 100644
index 00000000..ebe9854b
--- /dev/null
+++ b/storage/innobase/btr/btr0defragment.cc
@@ -0,0 +1,843 @@
+/*****************************************************************************
+
+Copyright (C) 2012, 2014 Facebook, Inc. All Rights Reserved.
+Copyright (C) 2014, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file btr/btr0defragment.cc
+Index defragmentation.
+
+Created 05/29/2014 Rongrong Zhong
+Modified 16/07/2014 Sunguck Lee
+Modified 30/07/2014 Jan Lindström jan.lindstrom@mariadb.com
+*******************************************************/
+
+#include "btr0defragment.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "dict0defrag_bg.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "srv0start.h"
+
+#include <list>
+
+/* When there's no work, either because defragmentation is disabled, or
+because no query is submitted, the thread checks its state every
+BTR_DEFRAGMENT_SLEEP_IN_USECS.*/
+#define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000
+/* Reduce the target page size by this amount when a compression failure
+happens during defragmentation. 512 is chosen because it's a power of 2 and it
+is about 3% of the page size. When there are compression failures in
+defragmentation, our goal is to get a decent defrag ratio with as few
+compression failures as possible. From experimentation it seems that reducing
+the target size by 512 each time makes sure the page is compressible within a
+couple of iterations. */
+#define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE 512
+
+/** Item in the work queue for btr_defragment_thread. */
+struct btr_defragment_item_t
+{
+	btr_pcur_t*	pcur;		/* persistent cursor where
+					btr_defragment_n_pages should start */
+	os_event_t	event;		/* if not null, signal after work
+					is done */
+	bool		removed;	/* Mark an item as removed */
+	ulonglong	last_processed;	/* timestamp of last time this index
+					is processed by defragment thread */
+
+	btr_defragment_item_t(btr_pcur_t* pcur, os_event_t event);
+	~btr_defragment_item_t();
+};
+
+/* Work queue for defragmentation. */
+typedef std::list<btr_defragment_item_t*>	btr_defragment_wq_t;
+static btr_defragment_wq_t	btr_defragment_wq;
+
+/* Mutex protecting the defragmentation work queue.*/
+ib_mutex_t		btr_defragment_mutex;
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t	btr_defragment_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/* Number of compression failures caused by defragmentation since server
+start.
*/ +Atomic_counter<ulint> btr_defragment_compression_failures; +/* Number of btr_defragment_n_pages calls that altered page but didn't +manage to release any page. */ +Atomic_counter<ulint> btr_defragment_failures; +/* Total number of btr_defragment_n_pages calls that altered page. +The difference between btr_defragment_count and btr_defragment_failures shows +the amount of effort wasted. */ +Atomic_counter<ulint> btr_defragment_count; + +bool btr_defragment_active; + +struct defragment_chunk_state_t +{ + btr_defragment_item_t* m_item; +}; + +static defragment_chunk_state_t defragment_chunk_state; +static void btr_defragment_chunk(void*); + +static tpool::timer* btr_defragment_timer; +static tpool::task_group task_group(1); +static tpool::task btr_defragment_task(btr_defragment_chunk, 0, &task_group); +static void btr_defragment_start(); + +/******************************************************************//** +Constructor for btr_defragment_item_t. */ +btr_defragment_item_t::btr_defragment_item_t( + btr_pcur_t* pcur, + os_event_t event) +{ + this->pcur = pcur; + this->event = event; + this->removed = false; + this->last_processed = 0; +} + +/******************************************************************//** +Destructor for btr_defragment_item_t. */ +btr_defragment_item_t::~btr_defragment_item_t() { + if (this->pcur) { + btr_pcur_free_for_mysql(this->pcur); + } + if (this->event) { + os_event_set(this->event); + } +} + +static void submit_defragment_task(void*arg=0) +{ + srv_thread_pool->submit_task(&btr_defragment_task); +} + +/******************************************************************//** +Initialize defragmentation. */ +void +btr_defragment_init() +{ + srv_defragment_interval = 1000000000ULL / srv_defragment_frequency; + mutex_create(LATCH_ID_BTR_DEFRAGMENT_MUTEX, &btr_defragment_mutex); + defragment_chunk_state.m_item = 0; + btr_defragment_timer = srv_thread_pool->create_timer(submit_defragment_task); + btr_defragment_active = true; +} + +/******************************************************************//** +Shutdown defragmentation. Release all resources. */ +void +btr_defragment_shutdown() +{ + if (!btr_defragment_timer) + return; + delete btr_defragment_timer; + btr_defragment_timer = 0; + task_group.cancel_pending(&btr_defragment_task); + mutex_enter(&btr_defragment_mutex); + std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + while(iter != btr_defragment_wq.end()) { + btr_defragment_item_t* item = *iter; + iter = btr_defragment_wq.erase(iter); + delete item; + } + mutex_exit(&btr_defragment_mutex); + mutex_free(&btr_defragment_mutex); + btr_defragment_active = false; +} + + +/******************************************************************//** +Functions used by the query threads: btr_defragment_xxx_index +Query threads find/add/remove index. */ +/******************************************************************//** +Check whether the given index is in btr_defragment_wq. We use index->id +to identify indices. */ +bool +btr_defragment_find_index( + dict_index_t* index) /*!< Index to find. 
*/ +{ + mutex_enter(&btr_defragment_mutex); + for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + btr_defragment_item_t* item = *iter; + btr_pcur_t* pcur = item->pcur; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + dict_index_t* idx = btr_cur_get_index(cursor); + if (index->id == idx->id) { + mutex_exit(&btr_defragment_mutex); + return true; + } + } + mutex_exit(&btr_defragment_mutex); + return false; +} + +/******************************************************************//** +Query thread uses this function to add an index to btr_defragment_wq. +Return a pointer to os_event for the query thread to wait on if this is a +synchronized defragmentation. */ +os_event_t +btr_defragment_add_index( + dict_index_t* index, /*!< index to be added */ + dberr_t* err) /*!< out: error code */ +{ + mtr_t mtr; + *err = DB_SUCCESS; + + mtr_start(&mtr); + buf_block_t* block = btr_root_block_get(index, RW_NO_LATCH, &mtr); + page_t* page = NULL; + + if (block) { + page = buf_block_get_frame(block); + } + + if (page == NULL && !index->is_readable()) { + mtr_commit(&mtr); + *err = DB_DECRYPTION_FAILED; + return NULL; + } + + ut_ad(fil_page_index_page_check(page)); + ut_ad(!page_has_siblings(page)); + + if (page_is_leaf(page)) { + // Index root is a leaf page, no need to defragment. + mtr_commit(&mtr); + return NULL; + } + btr_pcur_t* pcur = btr_pcur_create_for_mysql(); + os_event_t event = os_event_create(0); + btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF, pcur, + true, 0, &mtr); + btr_pcur_move_to_next(pcur, &mtr); + btr_pcur_store_position(pcur, &mtr); + mtr_commit(&mtr); + dict_stats_empty_defrag_summary(index); + btr_defragment_item_t* item = new btr_defragment_item_t(pcur, event); + mutex_enter(&btr_defragment_mutex); + btr_defragment_wq.push_back(item); + if(btr_defragment_wq.size() == 1){ + /* Kick off defragmentation work */ + btr_defragment_start(); + } + mutex_exit(&btr_defragment_mutex); + return event; +} + +/******************************************************************//** +When table is dropped, this function is called to mark a table as removed in +btr_efragment_wq. The difference between this function and the remove_index +function is this will not NULL the event. */ +void +btr_defragment_remove_table( + dict_table_t* table) /*!< Index to be removed. */ +{ + mutex_enter(&btr_defragment_mutex); + for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + btr_defragment_item_t* item = *iter; + btr_pcur_t* pcur = item->pcur; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + dict_index_t* idx = btr_cur_get_index(cursor); + if (table->id == idx->table->id) { + item->removed = true; + } + } + mutex_exit(&btr_defragment_mutex); +} + +/******************************************************************//** +Query thread uses this function to mark an index as removed in +btr_efragment_wq. */ +void +btr_defragment_remove_index( + dict_index_t* index) /*!< Index to be removed. 
*/ +{ + mutex_enter(&btr_defragment_mutex); + for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + btr_defragment_item_t* item = *iter; + btr_pcur_t* pcur = item->pcur; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + dict_index_t* idx = btr_cur_get_index(cursor); + if (index->id == idx->id) { + item->removed = true; + item->event = NULL; + break; + } + } + mutex_exit(&btr_defragment_mutex); +} + +/******************************************************************//** +Functions used by defragmentation thread: btr_defragment_xxx_item. +Defragmentation thread operates on the work *item*. It gets/removes +item from the work queue. */ +/******************************************************************//** +Defragment thread uses this to remove an item from btr_defragment_wq. +When an item is removed from the work queue, all resources associated with it +are free as well. */ +void +btr_defragment_remove_item( + btr_defragment_item_t* item) /*!< Item to be removed. */ +{ + mutex_enter(&btr_defragment_mutex); + for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + if (item == *iter) { + btr_defragment_wq.erase(iter); + delete item; + break; + } + } + mutex_exit(&btr_defragment_mutex); +} + +/******************************************************************//** +Defragment thread uses this to get an item from btr_defragment_wq to work on. +The item is not removed from the work queue so query threads can still access +this item. We keep it this way so query threads can find and kill a +defragmentation even if that index is being worked on. Be aware that while you +work on this item you have no lock protection on it whatsoever. This is OK as +long as the query threads and defragment thread won't modify the same fields +without lock protection. +*/ +btr_defragment_item_t* +btr_defragment_get_item() +{ + if (btr_defragment_wq.empty()) { + return NULL; + //return nullptr; + } + mutex_enter(&btr_defragment_mutex); + std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + if (iter == btr_defragment_wq.end()) { + iter = btr_defragment_wq.begin(); + } + btr_defragment_item_t* item = *iter; + iter++; + mutex_exit(&btr_defragment_mutex); + return item; +} + +/*********************************************************************//** +Check whether we should save defragmentation statistics to persistent storage. +Currently we save the stats to persistent storage every 100 updates. 
*/ +UNIV_INTERN +void +btr_defragment_save_defrag_stats_if_needed( + dict_index_t* index) /*!< in: index */ +{ + if (srv_defragment_stats_accuracy != 0 // stats tracking disabled + && index->table->space_id != 0 // do not track system tables + && index->stat_defrag_modified_counter + >= srv_defragment_stats_accuracy) { + dict_stats_defrag_pool_add(index); + index->stat_defrag_modified_counter = 0; + } +} + +/*********************************************************************//** +Main defragment functionalities used by defragment thread.*/ +/*************************************************************//** +Calculate number of records from beginning of block that can +fit into size_limit +@return number of records */ +UNIV_INTERN +ulint +btr_defragment_calc_n_recs_for_size( + buf_block_t* block, /*!< in: B-tree page */ + dict_index_t* index, /*!< in: index of the page */ + ulint size_limit, /*!< in: size limit to fit records in */ + ulint* n_recs_size) /*!< out: actual size of the records that fit + in size_limit. */ +{ + page_t* page = buf_block_get_frame(block); + ulint n_recs = 0; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + mem_heap_t* heap = NULL; + ulint size = 0; + page_cur_t cur; + + const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0; + page_cur_set_before_first(block, &cur); + page_cur_move_to_next(&cur); + while (page_cur_get_rec(&cur) != page_get_supremum_rec(page)) { + rec_t* cur_rec = page_cur_get_rec(&cur); + offsets = rec_get_offsets(cur_rec, index, offsets, n_core, + ULINT_UNDEFINED, &heap); + ulint rec_size = rec_offs_size(offsets); + size += rec_size; + if (size > size_limit) { + size = size - rec_size; + break; + } + n_recs ++; + page_cur_move_to_next(&cur); + } + *n_recs_size = size; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return n_recs; +} + +/*************************************************************//** +Merge as many records from the from_block to the to_block. Delete +the from_block if all records are successfully merged to to_block. +@return the to_block to target for next merge operation. */ +static +buf_block_t* +btr_defragment_merge_pages( + dict_index_t* index, /*!< in: index tree */ + buf_block_t* from_block, /*!< in: origin of merge */ + buf_block_t* to_block, /*!< in: destination of merge */ + ulint zip_size, /*!< in: ROW_FORMAT=COMPRESSED size */ + ulint reserved_space, /*!< in: space reserved for future + insert to avoid immediate page split */ + ulint* max_data_size, /*!< in/out: max data size to + fit in a single compressed page. */ + mem_heap_t* heap, /*!< in/out: pointer to memory heap */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_t* from_page = buf_block_get_frame(from_block); + page_t* to_page = buf_block_get_frame(to_block); + ulint level = btr_page_get_level(from_page); + ulint n_recs = page_get_n_recs(from_page); + ulint new_data_size = page_get_data_size(to_page); + ulint max_ins_size = + page_get_max_insert_size(to_page, n_recs); + ulint max_ins_size_reorg = + page_get_max_insert_size_after_reorganize( + to_page, n_recs); + ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space + ? max_ins_size_reorg - reserved_space : 0; + ulint move_size = 0; + ulint n_recs_to_move = 0; + rec_t* rec = NULL; + ulint target_n_recs = 0; + rec_t* orig_pred; + + // Estimate how many records can be moved from the from_page to + // the to_page. 
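+	// For ROW_FORMAT=COMPRESSED pages the uncompressed frame is larger
+	// than the compressed target (*max_data_size), so shrink the usable
+	// space by that difference before counting records.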
+ if (zip_size) { + ulint page_diff = srv_page_size - *max_data_size; + max_ins_size_to_use = (max_ins_size_to_use > page_diff) + ? max_ins_size_to_use - page_diff : 0; + } + n_recs_to_move = btr_defragment_calc_n_recs_for_size( + from_block, index, max_ins_size_to_use, &move_size); + + // If max_ins_size >= move_size, we can move the records without + // reorganizing the page, otherwise we need to reorganize the page + // first to release more space. + if (move_size > max_ins_size) { + if (!btr_page_reorganize_block(page_zip_level, + to_block, index, + mtr)) { + if (!dict_index_is_clust(index) + && page_is_leaf(to_page)) { + ibuf_reset_free_bits(to_block); + } + // If reorganization fails, that means page is + // not compressable. There's no point to try + // merging into this page. Continue to the + // next page. + return from_block; + } + ut_ad(page_validate(to_page, index)); + max_ins_size = page_get_max_insert_size(to_page, n_recs); + ut_a(max_ins_size >= move_size); + } + + // Move records to pack to_page more full. + orig_pred = NULL; + target_n_recs = n_recs_to_move; + while (n_recs_to_move > 0) { + rec = page_rec_get_nth(from_page, + n_recs_to_move + 1); + orig_pred = page_copy_rec_list_start( + to_block, from_block, rec, index, mtr); + if (orig_pred) + break; + // If we reach here, that means compression failed after packing + // n_recs_to_move number of records to to_page. We try to reduce + // the targeted data size on the to_page by + // BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again. + btr_defragment_compression_failures++; + max_ins_size_to_use = + move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE + ? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE + : 0; + if (max_ins_size_to_use == 0) { + n_recs_to_move = 0; + move_size = 0; + break; + } + n_recs_to_move = btr_defragment_calc_n_recs_for_size( + from_block, index, max_ins_size_to_use, &move_size); + } + // If less than target_n_recs are moved, it means there are + // compression failures during page_copy_rec_list_start. Adjust + // the max_data_size estimation to reduce compression failures + // in the following runs. + if (target_n_recs > n_recs_to_move + && *max_data_size > new_data_size + move_size) { + *max_data_size = new_data_size + move_size; + } + // Set ibuf free bits if necessary. + if (!dict_index_is_clust(index) + && page_is_leaf(to_page)) { + if (zip_size) { + ibuf_reset_free_bits(to_block); + } else { + ibuf_update_free_bits_if_full( + to_block, + srv_page_size, + ULINT_UNDEFINED); + } + } + btr_cur_t parent; + if (n_recs_to_move == n_recs) { + /* The whole page is merged with the previous page, + free it. */ + lock_update_merge_left(to_block, orig_pred, + from_block); + btr_search_drop_page_hash_index(from_block); + btr_level_list_remove(*from_block, *index, mtr); + btr_page_get_father(index, from_block, mtr, &parent); + btr_cur_node_ptr_delete(&parent, mtr); + /* btr_blob_dbg_remove(from_page, index, + "btr_defragment_n_pages"); */ + btr_page_free(index, from_block, mtr); + } else { + // There are still records left on the page, so + // increment n_defragmented. Node pointer will be changed + // so remove the old node pointer. + if (n_recs_to_move > 0) { + // Part of the page is merged to left, remove + // the merged records, update record locks and + // node pointer. + dtuple_t* node_ptr; + page_delete_rec_list_start(rec, from_block, + index, mtr); + lock_update_split_and_merge(to_block, + orig_pred, + from_block); + // FIXME: reuse the node_ptr! 
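+			// The first user record of from_page changed, so
+			// delete the stale node pointer in the parent and
+			// insert a fresh one built from the new first record.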
+ btr_page_get_father(index, from_block, mtr, &parent); + btr_cur_node_ptr_delete(&parent, mtr); + rec = page_rec_get_next( + page_get_infimum_rec(from_page)); + node_ptr = dict_index_build_node_ptr( + index, rec, page_get_page_no(from_page), + heap, level); + btr_insert_on_non_leaf_level(0, index, level+1, + node_ptr, mtr); + } + to_block = from_block; + } + return to_block; +} + +/*************************************************************//** +Tries to merge N consecutive pages, starting from the page pointed by the +cursor. Skip space 0. Only consider leaf pages. +This function first loads all N pages into memory, then for each of +the pages other than the first page, it tries to move as many records +as possible to the left sibling to keep the left sibling full. During +the process, if any page becomes empty, that page will be removed from +the level list. Record locks, hash, and node pointers are updated after +page reorganization. +@return pointer to the last block processed, or NULL if reaching end of index */ +UNIV_INTERN +buf_block_t* +btr_defragment_n_pages( + buf_block_t* block, /*!< in: starting block for defragmentation */ + dict_index_t* index, /*!< in: index tree */ + uint n_pages,/*!< in: number of pages to defragment */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + /* We will need to load the n+1 block because if the last page is freed + and we need to modify the prev_page_no of that block. */ + buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1]; + page_t* first_page; + buf_block_t* current_block; + ulint total_data_size = 0; + ulint total_n_recs = 0; + ulint data_size_per_rec; + ulint optimal_page_size; + ulint reserved_space; + ulint max_data_size = 0; + uint n_defragmented = 0; + uint n_new_slots; + mem_heap_t* heap; + ibool end_of_index = FALSE; + + /* It doesn't make sense to call this function with n_pages = 1. */ + ut_ad(n_pages > 1); + + if (!page_is_leaf(block->frame)) { + return NULL; + } + + if (!index->table->space || !index->table->space_id) { + /* Ignore space 0. */ + return NULL; + } + + if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) { + n_pages = BTR_DEFRAGMENT_MAX_N_PAGES; + } + + first_page = buf_block_get_frame(block); + const ulint zip_size = index->table->space->zip_size(); + + /* 1. Load the pages and calculate the total data size. */ + blocks[0] = block; + for (uint i = 1; i <= n_pages; i++) { + page_t* page = buf_block_get_frame(blocks[i-1]); + uint32_t page_no = btr_page_get_next(page); + total_data_size += page_get_data_size(page); + total_n_recs += page_get_n_recs(page); + if (page_no == FIL_NULL) { + n_pages = i; + end_of_index = TRUE; + break; + } + + blocks[i] = btr_block_get(*index, page_no, RW_X_LATCH, true, + mtr); + } + + if (n_pages == 1) { + if (!page_has_prev(first_page)) { + /* last page in the index */ + if (dict_index_get_page(index) + == page_get_page_no(first_page)) + return NULL; + /* given page is the last page. + Lift the records to father. */ + btr_lift_page_up(index, block, mtr); + } + return NULL; + } + + /* 2. Calculate how many pages data can fit in. If not compressable, + return early. */ + ut_a(total_n_recs != 0); + data_size_per_rec = total_data_size / total_n_recs; + // For uncompressed pages, the optimal data size if the free space of a + // empty page. + optimal_page_size = page_get_free_space_of_empty( + page_is_comp(first_page)); + // For compressed pages, we take compression failures into account. 
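+	// The resulting max_data_size is passed to btr_defragment_merge_pages()
+	// and caps how much data it will try to pack into one compressed page.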
+ if (zip_size) { + ulint size = 0; + uint i = 0; + // We estimate the optimal data size of the index use samples of + // data size. These samples are taken when pages failed to + // compress due to insertion on the page. We use the average + // of all samples we have as the estimation. Different pages of + // the same index vary in compressibility. Average gives a good + // enough estimation. + for (;i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) { + if (index->stat_defrag_data_size_sample[i] == 0) { + break; + } + size += index->stat_defrag_data_size_sample[i]; + } + if (i != 0) { + size /= i; + optimal_page_size = ut_min(optimal_page_size, size); + } + max_data_size = optimal_page_size; + } + + reserved_space = ut_min(static_cast<ulint>( + static_cast<double>(optimal_page_size) + * (1 - srv_defragment_fill_factor)), + (data_size_per_rec + * srv_defragment_fill_factor_n_recs)); + optimal_page_size -= reserved_space; + n_new_slots = uint((total_data_size + optimal_page_size - 1) + / optimal_page_size); + if (n_new_slots >= n_pages) { + /* Can't defragment. */ + if (end_of_index) + return NULL; + return blocks[n_pages-1]; + } + + /* 3. Defragment pages. */ + heap = mem_heap_create(256); + // First defragmented page will be the first page. + current_block = blocks[0]; + // Start from the second page. + for (uint i = 1; i < n_pages; i ++) { + buf_block_t* new_block = btr_defragment_merge_pages( + index, blocks[i], current_block, zip_size, + reserved_space, &max_data_size, heap, mtr); + if (new_block != current_block) { + n_defragmented ++; + current_block = new_block; + } + } + mem_heap_free(heap); + n_defragmented ++; + btr_defragment_count++; + if (n_pages == n_defragmented) { + btr_defragment_failures++; + } else { + index->stat_defrag_n_pages_freed += (n_pages - n_defragmented); + } + if (end_of_index) + return NULL; + return current_block; +} + + + +void btr_defragment_start() { + if (!srv_defragment) + return; + ut_ad(!btr_defragment_wq.empty()); + submit_defragment_task(); +} + + +/** +Callback used by defragment timer + +Throttling "sleep", is implemented via rescheduling the +threadpool timer, which, when fired, will resume the work again, +where it is left. + +The state (current item) is stored in function parameter. +*/ +static void btr_defragment_chunk(void*) +{ + defragment_chunk_state_t* state = &defragment_chunk_state; + + btr_pcur_t* pcur; + btr_cur_t* cursor; + dict_index_t* index; + mtr_t mtr; + buf_block_t* first_block; + buf_block_t* last_block; + + while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { + if (!state->m_item) { + state->m_item = btr_defragment_get_item(); + } + /* If an index is marked as removed, we remove it from the work + queue. No other thread could be using this item at this point so + it's safe to remove now. */ + while (state->m_item && state->m_item->removed) { + btr_defragment_remove_item(state->m_item); + state->m_item = btr_defragment_get_item(); + } + if (!state->m_item) { + /* Queue empty */ + return; + } + + pcur = state->m_item->pcur; + ulonglong now = my_interval_timer(); + ulonglong elapsed = now - state->m_item->last_processed; + + if (elapsed < srv_defragment_interval) { + /* If we see an index again before the interval + determined by the configured frequency is reached, + we just sleep until the interval pass. Since + defragmentation of all indices queue up on a single + thread, it's likely other indices that follow this one + don't need to sleep again. 
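+			The "sleep" is implemented by re-arming the
+			defragment timer below instead of blocking a
+			thread pool worker.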
*/ + int sleep_ms = (int)((srv_defragment_interval - elapsed) / 1000 / 1000); + if (sleep_ms) { + btr_defragment_timer->set_time(sleep_ms, 0); + return; + } + } + log_free_check(); + mtr_start(&mtr); + cursor = btr_pcur_get_btr_cur(pcur); + index = btr_cur_get_index(cursor); + index->set_modified(mtr); + /* To follow the latching order defined in WL#6326, acquire index->lock X-latch. + This entitles us to acquire page latches in any order for the index. */ + mtr_x_lock_index(index, &mtr); + /* This will acquire index->lock SX-latch, which per WL#6363 is allowed + when we are already holding the X-latch. */ + btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr); + first_block = btr_cur_get_block(cursor); + + last_block = btr_defragment_n_pages(first_block, index, + srv_defragment_n_pages, + &mtr); + if (last_block) { + /* If we haven't reached the end of the index, + place the cursor on the last record of last page, + store the cursor position, and put back in queue. */ + page_t* last_page = buf_block_get_frame(last_block); + rec_t* rec = page_rec_get_prev( + page_get_supremum_rec(last_page)); + ut_a(page_rec_is_user_rec(rec)); + page_cur_position(rec, last_block, + btr_cur_get_page_cur(cursor)); + btr_pcur_store_position(pcur, &mtr); + mtr_commit(&mtr); + /* Update the last_processed time of this index. */ + state->m_item->last_processed = now; + } else { + dberr_t err = DB_SUCCESS; + mtr_commit(&mtr); + /* Reaching the end of the index. */ + dict_stats_empty_defrag_stats(index); + err = dict_stats_save_defrag_stats(index); + if (err != DB_SUCCESS) { + ib::error() << "Saving defragmentation stats for table " + << index->table->name + << " index " << index->name() + << " failed with error " << err; + } else { + err = dict_stats_save_defrag_summary(index); + + if (err != DB_SUCCESS) { + ib::error() << "Saving defragmentation summary for table " + << index->table->name + << " index " << index->name() + << " failed with error " << err; + } + } + + btr_defragment_remove_item(state->m_item); + state->m_item = NULL; + } + } +} diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc new file mode 100644 index 00000000..574998a9 --- /dev/null +++ b/storage/innobase/btr/btr0pcur.cc @@ -0,0 +1,681 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0pcur.cc +The index tree persistent cursor + +Created 2/23/1996 Heikki Tuuri +*******************************************************/ + +#include "btr0pcur.h" +#include "ut0byte.h" +#include "rem0cmp.h" +#include "trx0trx.h" + +/**************************************************************//** +Allocates memory for a persistent cursor object and initializes the cursor. +@return own: persistent cursor */ +btr_pcur_t* +btr_pcur_create_for_mysql(void) +/*============================*/ +{ + btr_pcur_t* pcur; + DBUG_ENTER("btr_pcur_create_for_mysql"); + + pcur = (btr_pcur_t*) ut_malloc_nokey(sizeof(btr_pcur_t)); + + pcur->btr_cur.index = NULL; + btr_pcur_init(pcur); + + DBUG_PRINT("btr_pcur_create_for_mysql", ("pcur: %p", pcur)); + DBUG_RETURN(pcur); +} + +/**************************************************************//** +Resets a persistent cursor object, freeing ::old_rec_buf if it is +allocated and resetting the other members to their initial values. */ +void +btr_pcur_reset( +/*===========*/ + btr_pcur_t* cursor) /*!< in, out: persistent cursor */ +{ + btr_pcur_free(cursor); + cursor->old_rec_buf = NULL; + cursor->btr_cur.index = NULL; + cursor->btr_cur.page_cur.rec = NULL; + cursor->old_rec = NULL; + cursor->old_n_core_fields = 0; + cursor->old_n_fields = 0; + cursor->old_stored = false; + + cursor->latch_mode = BTR_NO_LATCHES; + cursor->pos_state = BTR_PCUR_NOT_POSITIONED; +} + +/**************************************************************//** +Frees the memory for a persistent cursor object. */ +void +btr_pcur_free_for_mysql( +/*====================*/ + btr_pcur_t* cursor) /*!< in, own: persistent cursor */ +{ + DBUG_ENTER("btr_pcur_free_for_mysql"); + DBUG_PRINT("btr_pcur_free_for_mysql", ("pcur: %p", cursor)); + + btr_pcur_free(cursor); + ut_free(cursor); + DBUG_VOID_RETURN; +} + +/**************************************************************//** +The position of the cursor is stored by taking an initial segment of the +record the cursor is positioned on, before, or after, and copying it to the +cursor data structure, or just setting a flag if the cursor id before the +first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the +page where the cursor is positioned must not be empty if the index tree is +not totally empty! 
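+Besides the record prefix, the code below also stores rel_pos (BTR_PCUR_ON,
+BTR_PCUR_BEFORE or BTR_PCUR_AFTER), the block the cursor was on and that
+block's modify clock, so that the position can later be restored
+optimistically without a fresh search.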
*/ +void +btr_pcur_store_position( +/*====================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t* page_cursor; + buf_block_t* block; + rec_t* rec; + dict_index_t* index; + ulint offs; + + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + block = btr_pcur_get_block(cursor); + index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor)); + + page_cursor = btr_pcur_get_page_cur(cursor); + + rec = page_cur_get_rec(page_cursor); + offs = rec - block->frame; + ut_ad(block->page.id().page_no() == page_get_page_no(block->frame)); + ut_ad(block->page.buf_fix_count()); + /* For spatial index, when we do positioning on parent + buffer if necessary, it might not hold latches, but the + tree must be locked to prevent change on the page */ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_S_FIX + | MTR_MEMO_PAGE_X_FIX) + || (index->is_spatial() + && mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK))); + + cursor->old_stored = true; + + if (page_is_empty(block->frame)) { + /* It must be an empty index tree; NOTE that in this case + we do not store the modify_clock, but always do a search + if we restore the cursor position */ + + ut_a(!page_has_siblings(block->frame)); + ut_ad(page_is_leaf(block->frame)); + ut_ad(block->page.id().page_no() == index->page); + + if (page_rec_is_supremum_low(offs)) { + cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE; + } else { +before_first: + cursor->rel_pos = BTR_PCUR_BEFORE_FIRST_IN_TREE; + } + + return; + } + + if (page_rec_is_supremum_low(offs)) { + rec = page_rec_get_prev(rec); + + ut_ad(!page_rec_is_infimum(rec)); + if (UNIV_UNLIKELY(rec_is_metadata(rec, *index))) { +#if 0 /* MDEV-22867 had to relax this */ + /* If the table is emptied during an ALGORITHM=NOCOPY + DROP COLUMN ... that is not ALGORITHM=INSTANT, + then we must preserve any instant ADD metadata. */ + ut_ad(index->table->instant + || block->page.id().page_no() != index->page); +#endif + ut_ad(index->is_instant() + || block->page.id().page_no() != index->page); + ut_ad(page_get_n_recs(block->frame) == 1); + ut_ad(page_is_leaf(block->frame)); + ut_ad(!page_has_prev(block->frame)); + cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE; + return; + } + + cursor->rel_pos = BTR_PCUR_AFTER; + } else if (page_rec_is_infimum_low(offs)) { + rec = page_rec_get_next(rec); + + if (rec_is_metadata(rec, *index)) { + ut_ad(!page_has_prev(block->frame)); + rec = page_rec_get_next(rec); + if (page_rec_is_supremum(rec)) { + goto before_first; + } + } + + cursor->rel_pos = BTR_PCUR_BEFORE; + } else { + cursor->rel_pos = BTR_PCUR_ON; + } + + if (index->is_ibuf()) { + ut_ad(!index->table->not_redundant()); + cursor->old_n_fields = uint16_t(rec_get_n_fields_old(rec)); + } else { + cursor->old_n_fields = static_cast<uint16>( + dict_index_get_n_unique_in_tree(index)); + if (index->is_spatial() && !page_rec_is_leaf(rec)) { + ut_ad(dict_index_get_n_unique_in_tree_nonleaf(index) + == DICT_INDEX_SPATIAL_NODEPTR_SIZE); + /* For R-tree, we have to compare + the child page numbers as well. */ + cursor->old_n_fields + = DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1; + } + } + + cursor->old_n_core_fields = index->n_core_fields; + cursor->old_rec = rec_copy_prefix_to_buf(rec, index, + cursor->old_n_fields, + &cursor->old_rec_buf, + &cursor->buf_size); + cursor->block_when_stored.store(block); + + /* Function try to check if block is S/X latch. 
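+	The block is still buffer-fixed here; the modify clock read below is
+	later compared in btr_cur_optimistic_latch_leaves() to detect whether
+	the page may have changed while no latches were held.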
*/ + cursor->modify_clock = buf_block_get_modify_clock(block); +} + +/**************************************************************//** +Copies the stored position of a pcur to another pcur. */ +void +btr_pcur_copy_stored_position( +/*==========================*/ + btr_pcur_t* pcur_receive, /*!< in: pcur which will receive the + position info */ + btr_pcur_t* pcur_donate) /*!< in: pcur from which the info is + copied */ +{ + ut_free(pcur_receive->old_rec_buf); + memcpy(pcur_receive, pcur_donate, sizeof(btr_pcur_t)); + + if (pcur_donate->old_rec_buf) { + + pcur_receive->old_rec_buf = (byte*) + ut_malloc_nokey(pcur_donate->buf_size); + + memcpy(pcur_receive->old_rec_buf, pcur_donate->old_rec_buf, + pcur_donate->buf_size); + pcur_receive->old_rec = pcur_receive->old_rec_buf + + (pcur_donate->old_rec - pcur_donate->old_rec_buf); + } + + pcur_receive->old_n_core_fields = pcur_donate->old_n_core_fields; + pcur_receive->old_n_fields = pcur_donate->old_n_fields; +} + +/** Structure acts as functor to do the latching of leaf pages. +It returns true if latching of leaf pages succeeded and false +otherwise. */ +struct optimistic_latch_leaves +{ + btr_pcur_t *const cursor; + ulint *latch_mode; + mtr_t *const mtr; + + optimistic_latch_leaves(btr_pcur_t *cursor, ulint *latch_mode, mtr_t *mtr) + :cursor(cursor), latch_mode(latch_mode), mtr(mtr) {} + + bool operator() (buf_block_t *hint) const + { + return hint && btr_cur_optimistic_latch_leaves( + hint, cursor->modify_clock, latch_mode, + btr_pcur_get_btr_cur(cursor), __FILE__, __LINE__, mtr); + } +}; + +/**************************************************************//** +Restores the stored position of a persistent cursor bufferfixing the page and +obtaining the specified latches. If the cursor position was saved when the +(1) cursor was positioned on a user record: this function restores the position +to the last record LESS OR EQUAL to the stored record; +(2) cursor was positioned on a page infimum record: restores the position to +the last record LESS than the user record which was the successor of the page +infimum; +(3) cursor was positioned on the page supremum: restores to the first record +GREATER than the user record which was the predecessor of the supremum. +(4) cursor was positioned before the first or after the last in an empty tree: +restores to before first or after the last in the tree. +@return TRUE if the cursor position was stored when it was on a user +record and it can be restored on a user record whose ordering fields +are identical to the ones of the original user record */ +ibool +btr_pcur_restore_position_func( +/*===========================*/ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... 
*/ + btr_pcur_t* cursor, /*!< in: detached persistent cursor */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + dtuple_t* tuple; + page_cur_mode_t mode; + page_cur_mode_t old_mode; + mem_heap_t* heap; + + ut_ad(mtr->is_active()); + //ut_ad(cursor->old_stored); + ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED + || cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor)); + + if (UNIV_UNLIKELY + (cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE + || cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE)) { + dberr_t err = DB_SUCCESS; + + /* In these cases we do not try an optimistic restoration, + but always do a search */ + + err = btr_cur_open_at_index_side( + cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE, + index, latch_mode, + btr_pcur_get_btr_cur(cursor), 0, mtr); + + if (err != DB_SUCCESS) { + ib::warn() << " Error code: " << err + << " btr_pcur_restore_position_func " + << " called from file: " + << file << " line: " << line + << " table: " << index->table->name + << " index: " << index->name; + } + + cursor->latch_mode = + BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode); + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + cursor->block_when_stored.clear(); + + return(FALSE); + } + + ut_a(cursor->old_rec); + ut_a(cursor->old_n_core_fields); + ut_a(cursor->old_n_core_fields <= index->n_core_fields); + ut_a(cursor->old_n_fields); + + switch (latch_mode) { + case BTR_SEARCH_LEAF: + case BTR_MODIFY_LEAF: + case BTR_SEARCH_PREV: + case BTR_MODIFY_PREV: + /* Try optimistic restoration. */ + + if (cursor->block_when_stored.run_with_hint( + optimistic_latch_leaves(cursor, &latch_mode, + mtr))) { + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + cursor->latch_mode = latch_mode; + + buf_block_dbg_add_level( + btr_pcur_get_block(cursor), + dict_index_is_ibuf(index) + ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE); + + if (cursor->rel_pos == BTR_PCUR_ON) { +#ifdef UNIV_DEBUG + const rec_t* rec; + rec_offs offsets1_[REC_OFFS_NORMAL_SIZE]; + rec_offs offsets2_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets1 = offsets1_; + rec_offs* offsets2 = offsets2_; + rec = btr_pcur_get_rec(cursor); + + rec_offs_init(offsets1_); + rec_offs_init(offsets2_); + + heap = mem_heap_create(256); + ut_ad(cursor->old_n_core_fields + == index->n_core_fields); + + offsets1 = rec_get_offsets( + cursor->old_rec, index, offsets1, + cursor->old_n_core_fields, + cursor->old_n_fields, &heap); + offsets2 = rec_get_offsets( + rec, index, offsets2, + index->n_core_fields, + cursor->old_n_fields, &heap); + + ut_ad(!cmp_rec_rec(cursor->old_rec, + rec, offsets1, offsets2, + index)); + mem_heap_free(heap); +#endif /* UNIV_DEBUG */ + return(TRUE); + } + /* This is the same record as stored, + may need to be adjusted for BTR_PCUR_BEFORE/AFTER, + depending on search mode and direction. 
*/ + if (btr_pcur_is_on_user_rec(cursor)) { + cursor->pos_state + = BTR_PCUR_IS_POSITIONED_OPTIMISTIC; + } + return(FALSE); + } + } + + /* If optimistic restoration did not succeed, open the cursor anew */ + + heap = mem_heap_create(256); + + tuple = dtuple_create(heap, cursor->old_n_fields); + + dict_index_copy_types(tuple, index, cursor->old_n_fields); + + rec_copy_prefix_to_dtuple(tuple, cursor->old_rec, index, + cursor->old_n_core_fields, + cursor->old_n_fields, heap); + ut_ad(dtuple_check_typed(tuple)); + + /* Save the old search mode of the cursor */ + old_mode = cursor->search_mode; + + switch (cursor->rel_pos) { + case BTR_PCUR_ON: + mode = PAGE_CUR_LE; + break; + case BTR_PCUR_AFTER: + mode = PAGE_CUR_G; + break; + case BTR_PCUR_BEFORE: + mode = PAGE_CUR_L; + break; + default: + ut_error; + mode = PAGE_CUR_UNSUPP; + } + + btr_pcur_open_with_no_init_func(index, tuple, mode, latch_mode, + cursor, +#ifdef BTR_CUR_HASH_ADAPT + NULL, +#endif /* BTR_CUR_HASH_ADAPT */ + file, line, mtr); + + /* Restore the old search mode */ + cursor->search_mode = old_mode; + + ut_ad(cursor->rel_pos == BTR_PCUR_ON + || cursor->rel_pos == BTR_PCUR_BEFORE + || cursor->rel_pos == BTR_PCUR_AFTER); + rec_offs offsets[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets); + if (cursor->rel_pos == BTR_PCUR_ON + && btr_pcur_is_on_user_rec(cursor) + && !cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor), + rec_get_offsets(btr_pcur_get_rec(cursor), + index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap))) { + + /* We have to store the NEW value for the modify clock, + since the cursor can now be on a different page! + But we can retain the value of old_rec */ + + cursor->block_when_stored.store(btr_pcur_get_block(cursor)); + cursor->modify_clock = buf_block_get_modify_clock( + cursor->block_when_stored.block()); + cursor->old_stored = true; + + mem_heap_free(heap); + + return(TRUE); + } + + mem_heap_free(heap); + + /* We have to store new position information, modify_clock etc., + to the cursor because it can now be on a different page, the record + under it may have been removed, etc. */ + + btr_pcur_store_position(cursor, mtr); + + return(FALSE); +} + +/*********************************************************//** +Moves the persistent cursor to the first record on the next page. Releases the +latch on the current page, and bufferunfixes it. Note that there must not be +modifications on the current page, as then the x-latch can be released only in +mtr_commit. 
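+If the cursor was positioned with BTR_SEARCH_TREE or BTR_MODIFY_TREE, the
+latch mode is downgraded to the corresponding leaf mode before the next
+page is latched.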
*/ +void +btr_pcur_move_to_next_page( +/*=======================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the + last record of the current page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + ut_ad(btr_pcur_is_after_last_on_page(cursor)); + + cursor->old_stored = false; + + const page_t* page = btr_pcur_get_page(cursor); + + if (UNIV_UNLIKELY(!page)) { + return; + } + + const uint32_t next_page_no = btr_page_get_next(page); + + ut_ad(next_page_no != FIL_NULL); + + ulint mode = cursor->latch_mode; + switch (mode) { + case BTR_SEARCH_TREE: + mode = BTR_SEARCH_LEAF; + break; + case BTR_MODIFY_TREE: + mode = BTR_MODIFY_LEAF; + } + + buf_block_t* next_block = btr_block_get( + *btr_pcur_get_btr_cur(cursor)->index, next_page_no, mode, + page_is_leaf(page), mtr); + + if (UNIV_UNLIKELY(!next_block)) { + return; + } + + const page_t* next_page = buf_block_get_frame(next_block); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(next_page) == page_is_comp(page)); + ut_a(btr_page_get_prev(next_page) + == btr_pcur_get_block(cursor)->page.id().page_no()); +#endif /* UNIV_BTR_DEBUG */ + + btr_leaf_page_release(btr_pcur_get_block(cursor), mode, mtr); + + page_cur_set_before_first(next_block, btr_pcur_get_page_cur(cursor)); + + ut_d(page_check_dir(next_page)); +} + +/*********************************************************//** +Moves the persistent cursor backward if it is on the first record of the page. +Commits mtr. Note that to prevent a possible deadlock, the operation +first stores the position of the cursor, commits mtr, acquires the necessary +latches and restores the cursor position again before returning. The +alphabetical position of the cursor is guaranteed to be sensible on +return, but it may happen that the cursor is not positioned on the last +record of any page, because the structure of the tree may have changed +during the time when the cursor had no latches. */ +static +void +btr_pcur_move_backward_from_page( +/*=============================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor, must be on the first + record of the current page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint prev_page_no; + page_t* page; + buf_block_t* prev_block; + ulint latch_mode; + ulint latch_mode2; + + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + ut_ad(btr_pcur_is_before_first_on_page(cursor)); + ut_ad(!btr_pcur_is_before_first_in_tree(cursor)); + + latch_mode = cursor->latch_mode; + + if (latch_mode == BTR_SEARCH_LEAF) { + + latch_mode2 = BTR_SEARCH_PREV; + + } else if (latch_mode == BTR_MODIFY_LEAF) { + + latch_mode2 = BTR_MODIFY_PREV; + } else { + latch_mode2 = 0; /* To eliminate compiler warning */ + ut_error; + } + + btr_pcur_store_position(cursor, mtr); + + mtr_commit(mtr); + + mtr_start(mtr); + + btr_pcur_restore_position(latch_mode2, cursor, mtr); + + page = btr_pcur_get_page(cursor); + + prev_page_no = btr_page_get_prev(page); + + if (prev_page_no == FIL_NULL) { + } else if (btr_pcur_is_before_first_on_page(cursor)) { + + prev_block = btr_pcur_get_btr_cur(cursor)->left_block; + + btr_leaf_page_release(btr_pcur_get_block(cursor), + latch_mode, mtr); + + page_cur_set_after_last(prev_block, + btr_pcur_get_page_cur(cursor)); + } else { + + /* The repositioned cursor did not end on an infimum + record on a page. Cursor repositioning acquired a latch + also on the previous page, but we do not need the latch: + release it. 
*/ + + prev_block = btr_pcur_get_btr_cur(cursor)->left_block; + + btr_leaf_page_release(prev_block, latch_mode, mtr); + } + + cursor->latch_mode = latch_mode; + cursor->old_stored = false; +} + +/*********************************************************//** +Moves the persistent cursor to the previous record in the tree. If no records +are left, the cursor stays 'before first in tree'. +@return TRUE if the cursor was not before first in tree */ +ibool +btr_pcur_move_to_prev( +/*==================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + cursor->old_stored = false; + + if (btr_pcur_is_before_first_on_page(cursor)) { + + if (btr_pcur_is_before_first_in_tree(cursor)) { + + return(FALSE); + } + + btr_pcur_move_backward_from_page(cursor, mtr); + + return(TRUE); + } + + btr_pcur_move_to_prev_on_page(cursor); + + return(TRUE); +} + +/**************************************************************//** +If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first +user record satisfying the search condition, in the case PAGE_CUR_L or +PAGE_CUR_LE, on the last user record. If no such user record exists, then +in the first case sets the cursor after last in tree, and in the latter case +before first in tree. The latching mode must be BTR_SEARCH_LEAF or +BTR_MODIFY_LEAF. */ +void +btr_pcur_open_on_user_rec_func( +/*===========================*/ + dict_index_t* index, /*!< in: index */ + const dtuple_t* tuple, /*!< in: tuple on which search done */ + page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ... */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or + BTR_MODIFY_LEAF */ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent + cursor */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + btr_pcur_open_low(index, 0, tuple, mode, latch_mode, cursor, + file, line, 0, mtr); + + if ((mode == PAGE_CUR_GE) || (mode == PAGE_CUR_G)) { + + if (btr_pcur_is_after_last_on_page(cursor)) { + + btr_pcur_move_to_next_user_rec(cursor, mtr); + } + } else { + ut_ad((mode == PAGE_CUR_LE) || (mode == PAGE_CUR_L)); + + /* Not implemented yet */ + + ut_error; + } +} diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc new file mode 100644 index 00000000..f22e3a59 --- /dev/null +++ b/storage/innobase/btr/btr0sea.cc @@ -0,0 +1,2372 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. +Copyright (c) 2017, 2021, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file btr/btr0sea.cc +The index tree adaptive search + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#include "btr0sea.h" +#ifdef BTR_CUR_HASH_ADAPT +#include "buf0buf.h" +#include "page0page.h" +#include "page0cur.h" +#include "btr0cur.h" +#include "btr0pcur.h" +#include "btr0btr.h" +#include "srv0mon.h" + +/** Is search system enabled. +Search system is protected by array of latches. */ +char btr_search_enabled; + +/** Number of adaptive hash index partition. */ +ulong btr_ahi_parts; + +#ifdef UNIV_SEARCH_PERF_STAT +/** Number of successful adaptive hash index lookups */ +ulint btr_search_n_succ = 0; +/** Number of failed adaptive hash index lookups */ +ulint btr_search_n_hash_fail = 0; +#endif /* UNIV_SEARCH_PERF_STAT */ + +/** The adaptive hash index */ +btr_search_sys_t btr_search_sys; + +/** If the number of records on the page divided by this parameter +would have been successfully accessed using a hash index, the index +is then built on the page, assuming the global limit has been reached */ +#define BTR_SEARCH_PAGE_BUILD_LIMIT 16U + +/** The global limit for consecutive potentially successful hash searches, +before hash index building is started */ +#define BTR_SEARCH_BUILD_LIMIT 100U + +/** Compute a hash value of a record in a page. +@param[in] rec index record +@param[in] offsets return value of rec_get_offsets() +@param[in] n_fields number of complete fields to fold +@param[in] n_bytes number of bytes to fold in the last field +@param[in] index_id index tree ID +@return the hash value */ +static inline +ulint +rec_fold( + const rec_t* rec, + const rec_offs* offsets, + ulint n_fields, + ulint n_bytes, + index_id_t tree_id) +{ + ulint i; + const byte* data; + ulint len; + ulint fold; + ulint n_fields_rec; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_validate(rec, offsets)); + ut_ad(page_rec_is_leaf(rec)); + ut_ad(!page_rec_is_metadata(rec)); + ut_ad(n_fields > 0 || n_bytes > 0); + + n_fields_rec = rec_offs_n_fields(offsets); + ut_ad(n_fields <= n_fields_rec); + ut_ad(n_fields < n_fields_rec || n_bytes == 0); + + if (n_fields > n_fields_rec) { + n_fields = n_fields_rec; + } + + if (n_fields == n_fields_rec) { + n_bytes = 0; + } + + fold = ut_fold_ull(tree_id); + + for (i = 0; i < n_fields; i++) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (len != UNIV_SQL_NULL) { + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + if (n_bytes > 0) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (len != UNIV_SQL_NULL) { + if (len > n_bytes) { + len = n_bytes; + } + + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + return(fold); +} + +/** Determine the number of accessed key fields. +@param[in] n_fields number of complete fields +@param[in] n_bytes number of bytes in an incomplete last field +@return number of complete or incomplete fields */ +inline MY_ATTRIBUTE((warn_unused_result)) +ulint +btr_search_get_n_fields( + ulint n_fields, + ulint n_bytes) +{ + return(n_fields + (n_bytes > 0 ? 
1 : 0)); +} + +/** Determine the number of accessed key fields. +@param[in] cursor b-tree cursor +@return number of complete or incomplete fields */ +inline MY_ATTRIBUTE((warn_unused_result)) +ulint +btr_search_get_n_fields( + const btr_cur_t* cursor) +{ + return(btr_search_get_n_fields(cursor->n_fields, cursor->n_bytes)); +} + +/** This function should be called before reserving any btr search mutex, if +the intended operation might add nodes to the search system hash table. +Because of the latching order, once we have reserved the btr search system +latch, we cannot allocate a free frame from the buffer pool. Checks that +there is a free buffer frame allocated for hash table heap in the btr search +system. If not, allocates a free frames for the heap. This check makes it +probable that, when have reserved the btr search system latch and we need to +allocate a new node to the hash table, it will succeed. However, the check +will not guarantee success. +@param[in] index index handler */ +static void btr_search_check_free_space_in_heap(const dict_index_t *index) +{ + /* Note that we peek the value of heap->free_block without reserving + the latch: this is ok, because we will not guarantee that there will + be enough free space in the hash table. */ + + buf_block_t *block= buf_block_alloc(); + auto part= btr_search_sys.get_part(*index); + + rw_lock_x_lock(&part->latch); + + if (!btr_search_enabled || part->heap->free_block) + buf_block_free(block); + else + part->heap->free_block= block; + + rw_lock_x_unlock(&part->latch); +} + +/** Set index->ref_count = 0 on all indexes of a table. +@param[in,out] table table handler */ +static void btr_search_disable_ref_count(dict_table_t *table) +{ + for (dict_index_t *index= dict_table_get_first_index(table); index; + index= dict_table_get_next_index(index)) + index->search_info->ref_count= 0; +} + +/** Lazily free detached metadata when removing the last reference. */ +ATTRIBUTE_COLD static void btr_search_lazy_free(dict_index_t *index) +{ + ut_ad(index->freed()); + dict_table_t *table= index->table; + /* Perform the skipped steps of dict_index_remove_from_cache_low(). */ + UT_LIST_REMOVE(table->freed_indexes, index); + rw_lock_free(&index->lock); + dict_mem_index_free(index); + + if (!UT_LIST_GET_LEN(table->freed_indexes) && + !UT_LIST_GET_LEN(table->indexes)) + { + ut_ad(table->id == 0); + dict_mem_table_free(table); + } +} + +/** Disable the adaptive hash search system and empty the index. */ +void btr_search_disable() +{ + dict_table_t* table; + + mutex_enter(&dict_sys.mutex); + + btr_search_x_lock_all(); + + if (!btr_search_enabled) { + mutex_exit(&dict_sys.mutex); + btr_search_x_unlock_all(); + return; + } + + btr_search_enabled = false; + + /* Clear the index->search_info->ref_count of every index in + the data dictionary cache. */ + for (table = UT_LIST_GET_FIRST(dict_sys.table_LRU); table; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + btr_search_disable_ref_count(table); + } + + for (table = UT_LIST_GET_FIRST(dict_sys.table_non_LRU); table; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + btr_search_disable_ref_count(table); + } + + mutex_exit(&dict_sys.mutex); + + /* Set all block->index = NULL. */ + buf_pool.clear_hash_index(); + + /* Clear the adaptive hash index. */ + btr_search_sys.clear(); + + btr_search_x_unlock_all(); +} + +/** Enable the adaptive hash search system. 
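+The hash tables are sized from the current buffer pool size
+(buf_pool_get_curr_size() / sizeof(void*) / 64 below); if the partitions
+have already been allocated, the call is a no-op.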
+@param resize whether buf_pool_t::resize() is the caller */ +void btr_search_enable(bool resize) +{ + if (!resize) { + mysql_mutex_lock(&buf_pool.mutex); + bool changed = srv_buf_pool_old_size != srv_buf_pool_size; + mysql_mutex_unlock(&buf_pool.mutex); + if (changed) { + return; + } + } + + btr_search_x_lock_all(); + ulint hash_size = buf_pool_get_curr_size() / sizeof(void *) / 64; + + if (btr_search_sys.parts[0].heap) { + ut_ad(btr_search_enabled); + btr_search_x_unlock_all(); + return; + } + + btr_search_sys.alloc(hash_size); + + btr_search_enabled = true; + btr_search_x_unlock_all(); +} + +/** Updates the search info of an index about hash successes. NOTE that info +is NOT protected by any semaphore, to save CPU time! Do not assume its fields +are consistent. +@param[in,out] info search info +@param[in] cursor cursor which was just positioned */ +static +void +btr_search_info_update_hash( + btr_search_t* info, + btr_cur_t* cursor) +{ + dict_index_t* index = cursor->index; + int cmp; + + ut_ad(!btr_search_own_any(RW_LOCK_S)); + ut_ad(!btr_search_own_any(RW_LOCK_X)); + + if (dict_index_is_ibuf(index)) { + /* So many deletes are performed on an insert buffer tree + that we do not consider a hash index useful on it: */ + + return; + } + + uint16_t n_unique = dict_index_get_n_unique_in_tree(index); + + if (info->n_hash_potential == 0) { + + goto set_new_recomm; + } + + /* Test if the search would have succeeded using the recommended + hash prefix */ + + if (info->n_fields >= n_unique && cursor->up_match >= n_unique) { +increment_potential: + info->n_hash_potential++; + + return; + } + + cmp = ut_pair_cmp(info->n_fields, info->n_bytes, + cursor->low_match, cursor->low_bytes); + + if (info->left_side ? cmp <= 0 : cmp > 0) { + + goto set_new_recomm; + } + + cmp = ut_pair_cmp(info->n_fields, info->n_bytes, + cursor->up_match, cursor->up_bytes); + + if (info->left_side ? cmp <= 0 : cmp > 0) { + + goto increment_potential; + } + +set_new_recomm: + /* We have to set a new recommendation; skip the hash analysis + for a while to avoid unnecessary CPU time usage when there is no + chance for success */ + + info->hash_analysis = 0; + + cmp = ut_pair_cmp(cursor->up_match, cursor->up_bytes, + cursor->low_match, cursor->low_bytes); + info->left_side = cmp >= 0; + info->n_hash_potential = cmp != 0; + + if (cmp == 0) { + /* For extra safety, we set some sensible values here */ + info->n_fields = 1; + info->n_bytes = 0; + } else if (cmp > 0) { + info->n_hash_potential = 1; + + if (cursor->up_match >= n_unique) { + + info->n_fields = n_unique; + info->n_bytes = 0; + + } else if (cursor->low_match < cursor->up_match) { + + info->n_fields = static_cast<uint16_t>( + cursor->low_match + 1); + info->n_bytes = 0; + } else { + info->n_fields = static_cast<uint16_t>( + cursor->low_match); + info->n_bytes = static_cast<uint16_t>( + cursor->low_bytes + 1); + } + } else { + if (cursor->low_match >= n_unique) { + + info->n_fields = n_unique; + info->n_bytes = 0; + } else if (cursor->low_match > cursor->up_match) { + + info->n_fields = static_cast<uint16_t>( + cursor->up_match + 1); + info->n_bytes = 0; + } else { + info->n_fields = static_cast<uint16_t>( + cursor->up_match); + info->n_bytes = static_cast<uint16_t>( + cursor->up_bytes + 1); + } + } +} + +/** Update the block search info on hash successes. NOTE that info and +block->n_hash_helps, n_fields, n_bytes, left_side are NOT protected by any +semaphore, to save CPU time! Do not assume the fields are consistent. 
+@return TRUE if building a (new) hash index on the block is recommended +@param[in,out] info search info +@param[in,out] block buffer block */ +static +bool +btr_search_update_block_hash_info(btr_search_t* info, buf_block_t* block) +{ + ut_ad(!btr_search_own_any()); + ut_ad(rw_lock_own_flagged(&block->lock, + RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + + info->last_hash_succ = FALSE; + ut_d(auto state= block->page.state()); + ut_ad(state == BUF_BLOCK_NOT_USED + || state == BUF_BLOCK_FILE_PAGE + || state == BUF_BLOCK_MEMORY + || state == BUF_BLOCK_REMOVE_HASH); + ut_ad(info->magic_n == BTR_SEARCH_MAGIC_N); + + if ((block->n_hash_helps > 0) + && (info->n_hash_potential > 0) + && (block->n_fields == info->n_fields) + && (block->n_bytes == info->n_bytes) + && (block->left_side == info->left_side)) { + + if ((block->index) + && (block->curr_n_fields == info->n_fields) + && (block->curr_n_bytes == info->n_bytes) + && (block->curr_left_side == info->left_side)) { + + /* The search would presumably have succeeded using + the hash index */ + + info->last_hash_succ = TRUE; + } + + block->n_hash_helps++; + } else { + block->n_hash_helps = 1; + block->n_fields = info->n_fields; + block->n_bytes = info->n_bytes; + block->left_side = info->left_side; + } + + if ((block->n_hash_helps > page_get_n_recs(block->frame) + / BTR_SEARCH_PAGE_BUILD_LIMIT) + && (info->n_hash_potential >= BTR_SEARCH_BUILD_LIMIT)) { + + if ((!block->index) + || (block->n_hash_helps + > 2U * page_get_n_recs(block->frame)) + || (block->n_fields != block->curr_n_fields) + || (block->n_bytes != block->curr_n_bytes) + || (block->left_side != block->curr_left_side)) { + + /* Build a new hash index on the page */ + + return(true); + } + } + + return(false); +} + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/** Maximum number of records in a page */ +constexpr ulint MAX_N_POINTERS = UNIV_PAGE_SIZE_MAX / REC_N_NEW_EXTRA_BYTES; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + +__attribute__((nonnull)) +/** +Insert an entry into the hash table. If an entry with the same fold number +is found, its node is updated to point to the new data, and no new node +is inserted. 
+@param table hash table +@param heap memory heap +@param fold folded value of the record +@param block buffer block containing the record +@param data the record +@retval true on success +@retval false if no more memory could be allocated */ +static bool ha_insert_for_fold(hash_table_t *table, mem_heap_t* heap, + ulint fold, +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t *block, /*!< buffer block of data */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t *data) +{ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(block->frame == page_align(data)); +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + ut_ad(btr_search_enabled); + + hash_cell_t *cell= &table->array[table->calc_hash(fold)]; + + for (ha_node_t *prev= static_cast<ha_node_t*>(cell->node); prev; + prev= prev->next) + { + if (prev->fold == fold) + { +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t *prev_block= prev->block; + ut_a(prev_block->frame == page_align(prev->data)); + ut_a(prev_block->n_pointers-- < MAX_N_POINTERS); + ut_a(block->n_pointers++ < MAX_N_POINTERS); + + prev->block= block; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + prev->data= data; + return true; + } + } + + /* We have to allocate a new chain node */ + ha_node_t *node= static_cast<ha_node_t*>(mem_heap_alloc(heap, sizeof *node)); + + if (!node) + return false; + + ha_node_set_data(node, block, data); + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(block->n_pointers++ < MAX_N_POINTERS); +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + + node->fold= fold; + node->next= nullptr; + + ha_node_t *prev= static_cast<ha_node_t*>(cell->node); + if (!prev) + cell->node= node; + else + { + while (prev->next) + prev= prev->next; + prev->next= node; + } + return true; +} + +__attribute__((nonnull)) +/** Delete a record. +@param table hash table +@param heap memory heap +@param del_node record to be deleted */ +static void ha_delete_hash_node(hash_table_t *table, mem_heap_t *heap, + ha_node_t *del_node) +{ + ut_ad(btr_search_enabled); +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(del_node->block->frame == page_align(del_node->data)); + ut_a(del_node->block->n_pointers-- < MAX_N_POINTERS); +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + + const ulint fold= del_node->fold; + + HASH_DELETE(ha_node_t, next, table, fold, del_node); + + ha_node_t *top= static_cast<ha_node_t*>(mem_heap_get_top(heap, sizeof *top)); + + if (del_node != top) + { + /* Compact the heap of nodes by moving the top in the place of del_node. */ + *del_node= *top; + hash_cell_t *cell= &table->array[table->calc_hash(top->fold)]; + + /* Look for the pointer to the top node, to update it */ + if (cell->node == top) + /* The top node is the first in the chain */ + cell->node= del_node; + else + { + /* We have to look for the predecessor */ + ha_node_t *node= static_cast<ha_node_t*>(cell->node); + + while (top != HASH_GET_NEXT(next, node)) + node= static_cast<ha_node_t*>(HASH_GET_NEXT(next, node)); + + /* Now we have the predecessor node */ + node->next= del_node; + } + } + + /* Free the occupied space */ + mem_heap_free_top(heap, sizeof *top); +} + +__attribute__((nonnull)) +/** Delete all pointers to a page. 
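+Only the hash chain selected by the given fold value is scanned; the caller
+invokes this once for each distinct fold value cached from the page (see
+btr_search_drop_page_hash_index() below).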
+@param table hash table +@param heap memory heap +@param page record to be deleted */ +static void ha_remove_all_nodes_to_page(hash_table_t *table, mem_heap_t *heap, + ulint fold, const page_t *page) +{ + for (ha_node_t *node= ha_chain_get_first(table, fold); node; ) + { + if (page_align(ha_node_get_data(node)) == page) + { + ha_delete_hash_node(table, heap, node); + /* The deletion may compact the heap of nodes and move other nodes! */ + node= ha_chain_get_first(table, fold); + } + else + node= ha_chain_get_next(node); + } +#ifdef UNIV_DEBUG + /* Check that all nodes really got deleted */ + for (ha_node_t *node= ha_chain_get_first(table, fold); node; + node= ha_chain_get_next(node)) + ut_ad(page_align(ha_node_get_data(node)) != page); +#endif /* UNIV_DEBUG */ +} + +/** Delete a record if found. +@param table hash table +@param heap memory heap for the hash bucket chain +@param fold folded value of the searched data +@param data pointer to the record +@return whether the record was found */ +static bool ha_search_and_delete_if_found(hash_table_t *table, + mem_heap_t *heap, + ulint fold, const rec_t *data) +{ + if (ha_node_t *node= ha_search_with_data(table, fold, data)) + { + ha_delete_hash_node(table, heap, node); + return true; + } + + return false; +} + +__attribute__((nonnull)) +/** Looks for an element when we know the pointer to the data and +updates the pointer to data if found. +@param table hash table +@param fold folded value of the searched data +@param data pointer to the data +@param new_data new pointer to the data +@return whether the element was found */ +static bool ha_search_and_update_if_found(hash_table_t *table, ulint fold, + const rec_t *data, +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + /** block containing new_data */ + buf_block_t *new_block, +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t *new_data) +{ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(new_block->frame == page_align(new_data)); +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + + if (!btr_search_enabled) + return false; + + if (ha_node_t *node= ha_search_with_data(table, fold, data)) + { +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(node->block->n_pointers-- < MAX_N_POINTERS); + ut_a(new_block->n_pointers++ < MAX_N_POINTERS); + node->block= new_block; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + node->data= new_data; + + return true; + } + + return false; +} + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +#else +# define ha_insert_for_fold(t,h,f,b,d) ha_insert_for_fold(t,h,f,d) +# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \ + ha_search_and_update_if_found(table,fold,data,new_data) +#endif + +/** Updates a hash node reference when it has been unsuccessfully used in a +search which could have succeeded with the used hash parameters. This can +happen because when building a hash index for a page, we do not check +what happens at page boundaries, and therefore there can be misleading +hash nodes. Also, collisions in the fold value can lead to misleading +references. This function lazily fixes these imperfections in the hash +index. 
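+Below, a reference is inserted only if the block is still hashed with the
+currently recommended prefix (curr_n_fields, curr_n_bytes, curr_left_side)
+and the adaptive hash index is still enabled.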
+@param[in] info search info +@param[in] block buffer block where cursor positioned +@param[in] cursor cursor */ +static +void +btr_search_update_hash_ref( + const btr_search_t* info, + buf_block_t* block, + const btr_cur_t* cursor) +{ + ut_ad(cursor->flag == BTR_CUR_HASH_FAIL); + + ut_ad(rw_lock_own_flagged(&block->lock, + RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + ut_ad(page_align(btr_cur_get_rec(cursor)) == block->frame); + ut_ad(page_is_leaf(block->frame)); + assert_block_ahi_valid(block); + + dict_index_t* index = block->index; + + if (!index || !info->n_hash_potential) { + return; + } + + if (index != cursor->index) { + ut_ad(index->id == cursor->index->id); + btr_search_drop_page_hash_index(block); + return; + } + + ut_ad(block->page.id().space() == index->table->space_id); + ut_ad(index == cursor->index); + ut_ad(!dict_index_is_ibuf(index)); + auto part = btr_search_sys.get_part(*index); + rw_lock_x_lock(&part->latch); + ut_ad(!block->index || block->index == index); + + if (block->index + && (block->curr_n_fields == info->n_fields) + && (block->curr_n_bytes == info->n_bytes) + && (block->curr_left_side == info->left_side) + && btr_search_enabled) { + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + const rec_t* rec = btr_cur_get_rec(cursor); + + if (!page_rec_is_user_rec(rec)) { + goto func_exit; + } + + ulint fold = rec_fold( + rec, + rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + ULINT_UNDEFINED, &heap), + block->curr_n_fields, + block->curr_n_bytes, index->id); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + ha_insert_for_fold(&part->table, part->heap, fold, block, rec); + + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } + +func_exit: + rw_lock_x_unlock(&part->latch); +} + +/** Checks if a guessed position for a tree cursor is right. Note that if +mode is PAGE_CUR_LE, which is used in inserts, and the function returns +TRUE, then cursor->up_match and cursor->low_match both have sensible values. +@param[in,out] cursor guess cursor position +@param[in] can_only_compare_to_cursor_rec + if we do not have a latch on the page of cursor, + but a latch corresponding search system, then + ONLY the columns of the record UNDER the cursor + are protected, not the next or previous record + in the chain: we cannot look at the next or + previous record to check our guess! 
+@param[in] tuple data tuple +@param[in] mode PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, PAGE_CUR_GE +@return whether a match was found */ +static +bool +btr_search_check_guess( + btr_cur_t* cursor, + bool can_only_compare_to_cursor_rec, + const dtuple_t* tuple, + ulint mode) +{ + rec_t* rec; + ulint n_unique; + ulint match; + int cmp; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + ibool success = FALSE; + rec_offs_init(offsets_); + + n_unique = dict_index_get_n_unique_in_tree(cursor->index); + + rec = btr_cur_get_rec(cursor); + + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(page_rec_is_leaf(rec)); + + match = 0; + + offsets = rec_get_offsets(rec, cursor->index, offsets, + cursor->index->n_core_fields, + n_unique, &heap); + cmp = cmp_dtuple_rec_with_match(tuple, rec, offsets, &match); + + if (mode == PAGE_CUR_GE) { + if (cmp > 0) { + goto exit_func; + } + + cursor->up_match = match; + + if (match >= n_unique) { + success = TRUE; + goto exit_func; + } + } else if (mode == PAGE_CUR_LE) { + if (cmp < 0) { + goto exit_func; + } + + cursor->low_match = match; + + } else if (mode == PAGE_CUR_G) { + if (cmp >= 0) { + goto exit_func; + } + } else if (mode == PAGE_CUR_L) { + if (cmp <= 0) { + goto exit_func; + } + } + + if (can_only_compare_to_cursor_rec) { + /* Since we could not determine if our guess is right just by + looking at the record under the cursor, return FALSE */ + goto exit_func; + } + + match = 0; + + if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE)) { + ut_ad(!page_rec_is_infimum(rec)); + + const rec_t* prev_rec = page_rec_get_prev(rec); + + if (page_rec_is_infimum(prev_rec)) { + success = !page_has_prev(page_align(prev_rec)); + goto exit_func; + } + + offsets = rec_get_offsets(prev_rec, cursor->index, offsets, + cursor->index->n_core_fields, + n_unique, &heap); + cmp = cmp_dtuple_rec_with_match( + tuple, prev_rec, offsets, &match); + if (mode == PAGE_CUR_GE) { + success = cmp > 0; + } else { + success = cmp >= 0; + } + } else { + ut_ad(!page_rec_is_supremum(rec)); + + const rec_t* next_rec = page_rec_get_next(rec); + + if (page_rec_is_supremum(next_rec)) { + if (!page_has_next(page_align(next_rec))) { + cursor->up_match = 0; + success = TRUE; + } + + goto exit_func; + } + + offsets = rec_get_offsets(next_rec, cursor->index, offsets, + cursor->index->n_core_fields, + n_unique, &heap); + cmp = cmp_dtuple_rec_with_match( + tuple, next_rec, offsets, &match); + if (mode == PAGE_CUR_LE) { + success = cmp < 0; + cursor->up_match = match; + } else { + success = cmp <= 0; + } + } +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(success); +} + +static +void +btr_search_failure(btr_search_t* info, btr_cur_t* cursor) +{ + cursor->flag = BTR_CUR_HASH_FAIL; + +#ifdef UNIV_SEARCH_PERF_STAT + ++info->n_hash_fail; + + if (info->n_hash_succ > 0) { + --info->n_hash_succ; + } +#endif /* UNIV_SEARCH_PERF_STAT */ + + info->last_hash_succ = FALSE; +} + +/** Clear the adaptive hash index on all pages in the buffer pool. 
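+This is called only while all btr_search_sys partition latches are held in
+exclusive mode and after btr_search_enabled has been cleared, as the
+assertions below document.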
*/ +inline void buf_pool_t::clear_hash_index() +{ + ut_ad(btr_search_own_all(RW_LOCK_X)); + ut_ad(!resizing); + ut_ad(!btr_search_enabled); + + std::set<dict_index_t*> garbage; + + for (chunk_t *chunk= chunks + n_chunks; chunk-- != chunks; ) + { + for (buf_block_t *block= chunk->blocks, * const end= block + chunk->size; + block != end; block++) + { + dict_index_t *index= block->index; + assert_block_ahi_valid(block); + + /* We can clear block->index and block->n_pointers when + btr_search_own_all(RW_LOCK_X); see the comments in buf0buf.h */ + + if (!index) + { +# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(!block->n_pointers); +# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + continue; + } + + ut_d(buf_page_state state= block->page.state()); + /* Another thread may have set the state to + BUF_BLOCK_REMOVE_HASH in buf_LRU_block_remove_hashed(). + + The state change in buf_pool_t::realloc() is not observable + here, because in that case we would have !block->index. + + In the end, the entire adaptive hash index will be removed. */ + ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_REMOVE_HASH); +# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + block->n_pointers= 0; +# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + if (index->freed()) + garbage.insert(index); + block->index= nullptr; + } + } + + for (dict_index_t *index : garbage) + btr_search_lazy_free(index); +} + +/** Get a buffer block from an adaptive hash index pointer. +This function does not return if the block is not identified. +@param ptr pointer to within a page frame +@return pointer to block, never NULL */ +inline buf_block_t* buf_pool_t::block_from_ahi(const byte *ptr) const +{ + chunk_t::map *chunk_map = chunk_t::map_ref; + ut_ad(chunk_t::map_ref == chunk_t::map_reg); + ut_ad(!resizing); + + chunk_t::map::const_iterator it= chunk_map->upper_bound(ptr); + ut_a(it != chunk_map->begin()); + + chunk_t *chunk= it == chunk_map->end() + ? chunk_map->rbegin()->second + : (--it)->second; + + const size_t offs= size_t(ptr - chunk->blocks->frame) >> srv_page_size_shift; + ut_a(offs < chunk->size); + + buf_block_t *block= &chunk->blocks[offs]; + /* buf_pool_t::chunk_t::init() invokes buf_block_init() so that + block[n].frame == block->frame + n * srv_page_size. Check it. */ + ut_ad(block->frame == page_align(ptr)); + /* Read the state of the block without holding hash_lock. + A state transition from BUF_BLOCK_FILE_PAGE to + BUF_BLOCK_REMOVE_HASH is possible during this execution. */ + ut_d(const buf_page_state state = block->page.state()); + ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_REMOVE_HASH); + return block; +} + +/** Tries to guess the right search position based on the hash search info +of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts, +and the function returns TRUE, then cursor->up_match and cursor->low_match +both have sensible values. +@param[in,out] index index +@param[in,out] info index search info +@param[in] tuple logical record +@param[in] mode PAGE_CUR_L, .... +@param[in] latch_mode BTR_SEARCH_LEAF, ...; + NOTE that only if has_search_latch is 0, we will + have a latch set on the cursor page, otherwise + we assume the caller uses his search latch + to protect the record! 
+@param[out] cursor tree cursor +@param[in] ahi_latch the adaptive hash index latch being held, + or NULL +@param[in] mtr mini transaction +@return whether the search succeeded */ +bool +btr_search_guess_on_hash( + dict_index_t* index, + btr_search_t* info, + const dtuple_t* tuple, + ulint mode, + ulint latch_mode, + btr_cur_t* cursor, + rw_lock_t* ahi_latch, + mtr_t* mtr) +{ + ulint fold; + index_id_t index_id; + + ut_ad(mtr->is_active()); + ut_ad(!ahi_latch || rw_lock_own_flagged( + ahi_latch, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + + if (!btr_search_enabled) { + return false; + } + + ut_ad(!index->is_ibuf()); + ut_ad(!ahi_latch + || ahi_latch == &btr_search_sys.get_part(*index)->latch); + ut_ad((latch_mode == BTR_SEARCH_LEAF) + || (latch_mode == BTR_MODIFY_LEAF)); + compile_time_assert(ulint{BTR_SEARCH_LEAF} == ulint{RW_S_LATCH}); + compile_time_assert(ulint{BTR_MODIFY_LEAF} == ulint{RW_X_LATCH}); + + /* Not supported for spatial index */ + ut_ad(!dict_index_is_spatial(index)); + + /* Note that, for efficiency, the struct info may not be protected by + any latch here! */ + + if (info->n_hash_potential == 0) { + return false; + } + + cursor->n_fields = info->n_fields; + cursor->n_bytes = info->n_bytes; + + if (dtuple_get_n_fields(tuple) < btr_search_get_n_fields(cursor)) { + return false; + } + + index_id = index->id; + +#ifdef UNIV_SEARCH_PERF_STAT + info->n_hash_succ++; +#endif + fold = dtuple_fold(tuple, cursor->n_fields, cursor->n_bytes, index_id); + + cursor->fold = fold; + cursor->flag = BTR_CUR_HASH; + + auto part = btr_search_sys.get_part(*index); + const rec_t* rec; + + if (!ahi_latch) { + rw_lock_s_lock(&part->latch); + + if (!btr_search_enabled) { + goto fail; + } + } else { + ut_ad(btr_search_enabled); + ut_ad(rw_lock_own(ahi_latch, RW_LOCK_S)); + } + + rec = static_cast<const rec_t*>( + ha_search_and_get_data(&part->table, fold)); + + if (!rec) { + if (!ahi_latch) { +fail: + rw_lock_s_unlock(&part->latch); + } + + btr_search_failure(info, cursor); + return false; + } + + buf_block_t* block = buf_pool.block_from_ahi(rec); + + if (!ahi_latch) { + page_hash_latch* hash_lock = buf_pool.hash_lock_get( + block->page.id()); + hash_lock->read_lock(); + + if (block->page.state() == BUF_BLOCK_REMOVE_HASH) { + /* Another thread is just freeing the block + from the LRU list of the buffer pool: do not + try to access this page. 
*/ + hash_lock->read_unlock(); + goto fail; + } + + const bool fail = index != block->index + && index_id == block->index->id; + ut_a(!fail || block->index->freed()); + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); + DBUG_ASSERT(fail || block->page.status != buf_page_t::FREED); + + buf_block_buf_fix_inc(block, __FILE__, __LINE__); + hash_lock->read_unlock(); + block->page.set_accessed(); + + buf_page_make_young_if_needed(&block->page); + mtr_memo_type_t fix_type; + if (latch_mode == BTR_SEARCH_LEAF) { + if (!rw_lock_s_lock_nowait(&block->lock, + __FILE__, __LINE__)) { +got_no_latch: + buf_block_buf_fix_dec(block); + goto fail; + } + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { + if (!rw_lock_x_lock_func_nowait_inline( + &block->lock, __FILE__, __LINE__)) { + goto got_no_latch; + } + fix_type = MTR_MEMO_PAGE_X_FIX; + } + mtr->memo_push(block, fix_type); + + buf_pool.stat.n_page_gets++; + + rw_lock_s_unlock(&part->latch); + + buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH); + if (UNIV_UNLIKELY(fail)) { + goto fail_and_release_page; + } + } else if (UNIV_UNLIKELY(index != block->index + && index_id == block->index->id)) { + ut_a(block->index->freed()); + goto fail_and_release_page; + } + + if (block->page.state() != BUF_BLOCK_FILE_PAGE) { + + ut_ad(block->page.state() == BUF_BLOCK_REMOVE_HASH); + +fail_and_release_page: + if (!ahi_latch) { + btr_leaf_page_release(block, latch_mode, mtr); + } + + btr_search_failure(info, cursor); + return false; + } + + ut_ad(page_rec_is_user_rec(rec)); + + btr_cur_position(index, (rec_t*) rec, block, cursor); + + /* Check the validity of the guess within the page */ + + /* If we only have the latch on search system, not on the + page, it only protects the columns of the record the cursor + is positioned on. We cannot look at the next of the previous + record to determine if our guess for the cursor position is + right. */ + if (index_id != btr_page_get_index_id(block->frame) + || !btr_search_check_guess(cursor, !!ahi_latch, tuple, mode)) { + goto fail_and_release_page; + } + + if (info->n_hash_potential < BTR_SEARCH_BUILD_LIMIT + 5) { + + info->n_hash_potential++; + } + +#ifdef notdefined + /* These lines of code can be used in a debug version to check + the correctness of the searched cursor position: */ + + info->last_hash_succ = FALSE; + + /* Currently, does not work if the following fails: */ + ut_ad(!ahi_latch); + + btr_leaf_page_release(block, latch_mode, mtr); + + btr_cur_search_to_nth_level( + index, 0, tuple, mode, latch_mode, &cursor2, 0, mtr); + + if (mode == PAGE_CUR_GE + && page_rec_is_supremum(btr_cur_get_rec(&cursor2))) { + + /* If mode is PAGE_CUR_GE, then the binary search + in the index tree may actually take us to the supremum + of the previous page */ + + info->last_hash_succ = FALSE; + + btr_pcur_open_on_user_rec( + index, tuple, mode, latch_mode, &pcur, mtr); + + ut_ad(btr_pcur_get_rec(&pcur) == btr_cur_get_rec(cursor)); + } else { + ut_ad(btr_cur_get_rec(&cursor2) == btr_cur_get_rec(cursor)); + } + + /* NOTE that it is theoretically possible that the above assertions + fail if the page of the cursor gets removed from the buffer pool + meanwhile! Thus it might not be a bug. 
*/ +#endif + info->last_hash_succ = TRUE; + +#ifdef UNIV_SEARCH_PERF_STAT + btr_search_n_succ++; +#endif + /* Increment the page get statistics though we did not really + fix the page: for user info only */ + ++buf_pool.stat.n_page_gets; + + if (!ahi_latch) { + buf_page_make_young_if_needed(&block->page); + } + + return true; +} + +/** Drop any adaptive hash index entries that point to an index page. +@param[in,out] block block containing index page, s- or x-latched, or an + index page for which we know that + block->buf_fix_count == 0 or it is an index page which + has already been removed from the buf_pool.page_hash + i.e.: it is in state BUF_BLOCK_REMOVE_HASH */ +void btr_search_drop_page_hash_index(buf_block_t* block) +{ + ulint n_fields; + ulint n_bytes; + const page_t* page; + const rec_t* rec; + ulint fold; + ulint prev_fold; + ulint n_cached; + ulint n_recs; + ulint* folds; + ulint i; + mem_heap_t* heap; + rec_offs* offsets; + +retry: + /* This debug check uses a dirty read that could theoretically cause + false positives while buf_pool.clear_hash_index() is executing. */ + assert_block_ahi_valid(block); + ut_ad(!btr_search_own_any(RW_LOCK_S)); + ut_ad(!btr_search_own_any(RW_LOCK_X)); + + if (!block->index) { + return; + } + + ut_ad(!block->page.buf_fix_count() + || block->page.state() == BUF_BLOCK_REMOVE_HASH + || rw_lock_own_flagged(&block->lock, + RW_LOCK_FLAG_X | RW_LOCK_FLAG_S + | RW_LOCK_FLAG_SX)); + ut_ad(page_is_leaf(block->frame)); + + /* We must not dereference block->index here, because it could be freed + if (index->table->n_ref_count == 0 && !mutex_own(&dict_sys.mutex)). + Determine the ahi_slot based on the block contents. */ + + const index_id_t index_id + = btr_page_get_index_id(block->frame); + + auto part = btr_search_sys.get_part(index_id, + block->page.id().space()); + + dict_index_t* index = block->index; + bool is_freed = index && index->freed(); + + if (is_freed) { + rw_lock_x_lock(&part->latch); + } else { + rw_lock_s_lock(&part->latch); + } + + assert_block_ahi_valid(block); + + + if (!index || !btr_search_enabled) { + if (is_freed) { + rw_lock_x_unlock(&part->latch); + } else { + rw_lock_s_unlock(&part->latch); + } + return; + } + +#ifdef MYSQL_INDEX_DISABLE_AHI + ut_ad(!index->disable_ahi); +#endif + ut_ad(btr_search_enabled); + + ut_ad(block->page.id().space() == index->table->space_id); + ut_a(index_id == index->id); + ut_ad(!dict_index_is_ibuf(index)); + + n_fields = block->curr_n_fields; + n_bytes = block->curr_n_bytes; + + /* NOTE: The AHI fields of block must not be accessed after + releasing search latch, as the index page might only be s-latched! 
*/ + + if (!is_freed) { + rw_lock_s_unlock(&part->latch); + } + + ut_a(n_fields > 0 || n_bytes > 0); + + page = block->frame; + n_recs = page_get_n_recs(page); + + /* Calculate and cache fold values into an array for fast deletion + from the hash index */ + + folds = (ulint*) ut_malloc_nokey(n_recs * sizeof(ulint)); + + n_cached = 0; + + rec = page_get_infimum_rec(page); + rec = page_rec_get_next_low(rec, page_is_comp(page)); + if (rec_is_metadata(rec, *index)) { + rec = page_rec_get_next_low(rec, page_is_comp(page)); + } + + prev_fold = 0; + + heap = NULL; + offsets = NULL; + + while (!page_rec_is_supremum(rec)) { + offsets = rec_get_offsets( + rec, index, offsets, index->n_core_fields, + btr_search_get_n_fields(n_fields, n_bytes), + &heap); + fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id); + + if (fold == prev_fold && prev_fold != 0) { + + goto next_rec; + } + + /* Remove all hash nodes pointing to this page from the + hash chain */ + + folds[n_cached] = fold; + n_cached++; +next_rec: + rec = page_rec_get_next_low(rec, page_rec_is_comp(rec)); + prev_fold = fold; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (!is_freed) { + rw_lock_x_lock(&part->latch); + + if (UNIV_UNLIKELY(!block->index)) { + /* Someone else has meanwhile dropped the + hash index */ + goto cleanup; + } + + ut_a(block->index == index); + } + + if (block->curr_n_fields != n_fields + || block->curr_n_bytes != n_bytes) { + + /* Someone else has meanwhile built a new hash index on the + page, with different parameters */ + + rw_lock_x_unlock(&part->latch); + + ut_free(folds); + goto retry; + } + + for (i = 0; i < n_cached; i++) { + ha_remove_all_nodes_to_page(&part->table, part->heap, + folds[i], page); + } + + switch (index->search_info->ref_count--) { + case 0: + ut_error; + case 1: + if (index->freed()) { + btr_search_lazy_free(index); + } + } + + block->index = NULL; + + MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_REMOVED); + MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_REMOVED, n_cached); + +cleanup: + assert_block_ahi_valid(block); + rw_lock_x_unlock(&part->latch); + + ut_free(folds); +} + +/** Drop possible adaptive hash index entries when a page is evicted +from the buffer pool or freed in a file, or the index is being dropped. +@param[in] page_id page id */ +void btr_search_drop_page_hash_when_freed(const page_id_t page_id) +{ + buf_block_t* block; + mtr_t mtr; + dberr_t err = DB_SUCCESS; + + mtr_start(&mtr); + + /* If the caller has a latch on the page, then the caller must + have a x-latch on the page and it must have already dropped + the hash index for the page. Because of the x-latch that we + are possibly holding, we cannot s-latch the page, but must + (recursively) x-latch it, even though we are only reading. */ + + block = buf_page_get_gen(page_id, 0, RW_X_LATCH, NULL, + BUF_PEEK_IF_IN_POOL, __FILE__, __LINE__, + &mtr, &err); + + if (block) { + + /* If AHI is still valid, page can't be in free state. + AHI is dropped when page is freed. */ + DBUG_ASSERT(block->page.status != buf_page_t::FREED); + + buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH); + + dict_index_t* index = block->index; + if (index != NULL) { + /* In all our callers, the table handle should + be open, or we should be in the process of + dropping the table (preventing eviction). */ + ut_ad(index->table->get_ref_count() > 0 + || mutex_own(&dict_sys.mutex)); + btr_search_drop_page_hash_index(block); + } + } + + mtr_commit(&mtr); +} + +/** Build a hash index on a page with the given parameters. 
If the page already +has a hash index with different parameters, the old hash index is removed. +If index is non-NULL, this function checks if n_fields and n_bytes are +sensible, and does not build a hash index if not. +@param[in,out] index index for which to build. +@param[in,out] block index page, s-/x- latched. +@param[in,out] ahi_latch the adaptive search latch +@param[in] n_fields hash this many full fields +@param[in] n_bytes hash this many bytes of the next field +@param[in] left_side hash for searches from left side */ +static +void +btr_search_build_page_hash_index( + dict_index_t* index, + buf_block_t* block, + rw_lock_t* ahi_latch, + uint16_t n_fields, + uint16_t n_bytes, + bool left_side) +{ + const rec_t* rec; + const rec_t* next_rec; + ulint fold; + ulint next_fold; + ulint n_cached; + ulint n_recs; + ulint* folds; + const rec_t** recs; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + +#ifdef MYSQL_INDEX_DISABLE_AHI + if (index->disable_ahi) return; +#endif + if (!btr_search_enabled) { + return; + } + + rec_offs_init(offsets_); + ut_ad(ahi_latch == &btr_search_sys.get_part(*index)->latch); + ut_ad(index); + ut_ad(block->page.id().space() == index->table->space_id); + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(page_is_leaf(block->frame)); + + ut_ad(rw_lock_own_flagged(&block->lock, + RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + ut_ad(block->page.id().page_no() >= 3); + + rw_lock_s_lock(ahi_latch); + + const bool enabled = btr_search_enabled; + const bool rebuild = enabled && block->index + && (block->curr_n_fields != n_fields + || block->curr_n_bytes != n_bytes + || block->curr_left_side != left_side); + + rw_lock_s_unlock(ahi_latch); + + if (!enabled) { + return; + } + + if (rebuild) { + btr_search_drop_page_hash_index(block); + } + + /* Check that the values for hash index build are sensible */ + + if (n_fields == 0 && n_bytes == 0) { + + return; + } + + if (dict_index_get_n_unique_in_tree(index) + < btr_search_get_n_fields(n_fields, n_bytes)) { + return; + } + + page_t* page = buf_block_get_frame(block); + n_recs = page_get_n_recs(page); + + if (n_recs == 0) { + + return; + } + + rec = page_rec_get_next_const(page_get_infimum_rec(page)); + + if (rec_is_metadata(rec, *index)) { + rec = page_rec_get_next_const(rec); + if (!--n_recs) return; + } + + /* Calculate and cache fold values and corresponding records into + an array for fast insertion to the hash index */ + + folds = static_cast<ulint*>(ut_malloc_nokey(n_recs * sizeof *folds)); + recs = static_cast<const rec_t**>( + ut_malloc_nokey(n_recs * sizeof *recs)); + + n_cached = 0; + + ut_a(index->id == btr_page_get_index_id(page)); + + offsets = rec_get_offsets( + rec, index, offsets, index->n_core_fields, + btr_search_get_n_fields(n_fields, n_bytes), + &heap); + ut_ad(page_rec_is_supremum(rec) + || n_fields == rec_offs_n_fields(offsets) - (n_bytes > 0)); + + fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id); + + if (left_side) { + + folds[n_cached] = fold; + recs[n_cached] = rec; + n_cached++; + } + + for (;;) { + next_rec = page_rec_get_next_const(rec); + + if (page_rec_is_supremum(next_rec)) { + + if (!left_side) { + + folds[n_cached] = fold; + recs[n_cached] = rec; + n_cached++; + } + + break; + } + + offsets = rec_get_offsets( + next_rec, index, offsets, index->n_core_fields, + btr_search_get_n_fields(n_fields, n_bytes), &heap); + next_fold = rec_fold(next_rec, offsets, n_fields, + n_bytes, index->id); + + if (fold != next_fold) { + /* Insert an entry into the 
hash index */ + + if (left_side) { + + folds[n_cached] = next_fold; + recs[n_cached] = next_rec; + n_cached++; + } else { + folds[n_cached] = fold; + recs[n_cached] = rec; + n_cached++; + } + } + + rec = next_rec; + fold = next_fold; + } + + btr_search_check_free_space_in_heap(index); + + rw_lock_x_lock(ahi_latch); + + if (!btr_search_enabled) { + goto exit_func; + } + + /* This counter is decremented every time we drop page + hash index entries and is incremented here. Since we can + rebuild hash index for a page that is already hashed, we + have to take care not to increment the counter in that + case. */ + if (!block->index) { + assert_block_ahi_empty(block); + index->search_info->ref_count++; + } else if (block->curr_n_fields != n_fields + || block->curr_n_bytes != n_bytes + || block->curr_left_side != left_side) { + goto exit_func; + } + + block->n_hash_helps = 0; + + block->curr_n_fields = n_fields & dict_index_t::MAX_N_FIELDS; + block->curr_n_bytes = n_bytes & ((1U << 15) - 1); + block->curr_left_side = left_side; + block->index = index; + + { + auto part = btr_search_sys.get_part(*index); + for (ulint i = 0; i < n_cached; i++) { + ha_insert_for_fold(&part->table, part->heap, + folds[i], block, recs[i]); + } + } + + MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_ADDED); + MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_ADDED, n_cached); +exit_func: + assert_block_ahi_valid(block); + rw_lock_x_unlock(ahi_latch); + + ut_free(folds); + ut_free(recs); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/** Updates the search info. +@param[in,out] info search info +@param[in,out] cursor cursor which was just positioned */ +void +btr_search_info_update_slow(btr_search_t* info, btr_cur_t* cursor) +{ + rw_lock_t* ahi_latch = &btr_search_sys.get_part(*cursor->index) + ->latch; + ut_ad(!rw_lock_own_flagged(ahi_latch, + RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + + buf_block_t* block = btr_cur_get_block(cursor); + + /* NOTE that the following two function calls do NOT protect + info or block->n_fields etc. with any semaphore, to save CPU time! + We cannot assume the fields are consistent when we return from + those functions! */ + + btr_search_info_update_hash(info, cursor); + + bool build_index = btr_search_update_block_hash_info(info, block); + + if (build_index || (cursor->flag == BTR_CUR_HASH_FAIL)) { + + btr_search_check_free_space_in_heap(cursor->index); + } + + if (cursor->flag == BTR_CUR_HASH_FAIL) { + /* Update the hash node reference, if appropriate */ + +#ifdef UNIV_SEARCH_PERF_STAT + btr_search_n_hash_fail++; +#endif /* UNIV_SEARCH_PERF_STAT */ + + btr_search_update_hash_ref(info, block, cursor); + } + + if (build_index) { + /* Note that since we did not protect block->n_fields etc. + with any semaphore, the values can be inconsistent. We have + to check inside the function call that they make sense. */ + btr_search_build_page_hash_index(cursor->index, block, + ahi_latch, + block->n_fields, + block->n_bytes, + block->left_side); + } +} + +/** Move or delete hash entries for moved records, usually in a page split. +If new_block is already hashed, then any hash index for block is dropped. +If new_block is not hashed, and block is hashed, then a new hash index is +built to new_block with the same parameters as block. 
+@param[in,out] new_block destination page +@param[in,out] block source page (subject to deletion later) */ +void +btr_search_move_or_delete_hash_entries( + buf_block_t* new_block, + buf_block_t* block) +{ + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_X)); + ut_ad(rw_lock_own(&(new_block->lock), RW_LOCK_X)); + + if (!btr_search_enabled) { + return; + } + + dict_index_t* index = block->index; + if (!index) { + index = new_block->index; + } else { + ut_ad(!new_block->index || index == new_block->index); + } + assert_block_ahi_valid(block); + assert_block_ahi_valid(new_block); + + rw_lock_t* ahi_latch = index + ? &btr_search_sys.get_part(*index)->latch + : nullptr; + + if (new_block->index) { +drop_exit: + btr_search_drop_page_hash_index(block); + return; + } + + if (!index) { + return; + } + + if (index->freed()) { + goto drop_exit; + } + + rw_lock_s_lock(ahi_latch); + + if (block->index) { + uint16_t n_fields = block->curr_n_fields; + uint16_t n_bytes = block->curr_n_bytes; + bool left_side = block->curr_left_side; + + new_block->n_fields = block->curr_n_fields; + new_block->n_bytes = block->curr_n_bytes; + new_block->left_side = left_side; + + rw_lock_s_unlock(ahi_latch); + + ut_a(n_fields > 0 || n_bytes > 0); + + btr_search_build_page_hash_index( + index, new_block, ahi_latch, + n_fields, n_bytes, left_side); + ut_ad(n_fields == block->curr_n_fields); + ut_ad(n_bytes == block->curr_n_bytes); + ut_ad(left_side == block->curr_left_side); + return; + } + + rw_lock_s_unlock(ahi_latch); +} + +/** Updates the page hash index when a single record is deleted from a page. +@param[in] cursor cursor which was positioned on the record to delete + using btr_cur_search_, the record is not yet deleted.*/ +void btr_search_update_hash_on_delete(btr_cur_t* cursor) +{ + buf_block_t* block; + const rec_t* rec; + ulint fold; + dict_index_t* index; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + mem_heap_t* heap = NULL; + rec_offs_init(offsets_); + + ut_ad(page_is_leaf(btr_cur_get_page(cursor))); +#ifdef MYSQL_INDEX_DISABLE_AHI + if (cursor->index->disable_ahi) return; +#endif + + if (!btr_search_enabled) { + return; + } + + block = btr_cur_get_block(cursor); + + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_X)); + + assert_block_ahi_valid(block); + index = block->index; + + if (!index) { + + return; + } + + if (index != cursor->index) { + btr_search_drop_page_hash_index(block); + return; + } + + ut_ad(block->page.id().space() == index->table->space_id); + ut_a(index == cursor->index); + ut_a(block->curr_n_fields > 0 || block->curr_n_bytes > 0); + ut_ad(!dict_index_is_ibuf(index)); + + rec = btr_cur_get_rec(cursor); + + fold = rec_fold(rec, rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + ULINT_UNDEFINED, &heap), + block->curr_n_fields, block->curr_n_bytes, index->id); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + auto part = btr_search_sys.get_part(*index); + + rw_lock_x_lock(&part->latch); + assert_block_ahi_valid(block); + + if (block->index && btr_search_enabled) { + ut_a(block->index == index); + + if (ha_search_and_delete_if_found(&part->table, part->heap, + fold, rec)) { + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVED); + } else { + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND); + } + + assert_block_ahi_valid(block); + } + + rw_lock_x_unlock(&part->latch); +} + +/** Updates the page hash index when a single record is inserted on a page. 
+@param[in] cursor cursor which was positioned to the place to insert + using btr_cur_search_, and the new record has been + inserted next to the cursor. +@param[in] ahi_latch the adaptive hash index latch */ +void +btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch) +{ + buf_block_t* block; + dict_index_t* index; + rec_t* rec; + + ut_ad(ahi_latch == &btr_search_sys.get_part(*cursor->index)->latch); + ut_ad(!btr_search_own_any(RW_LOCK_S)); + ut_ad(!btr_search_own_any(RW_LOCK_X)); +#ifdef MYSQL_INDEX_DISABLE_AHI + if (cursor->index->disable_ahi) return; +#endif + if (!btr_search_enabled) { + return; + } + + rec = btr_cur_get_rec(cursor); + + block = btr_cur_get_block(cursor); + + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_X)); + + index = block->index; + + if (!index) { + + return; + } + + if (index != cursor->index) { + ut_ad(index->id == cursor->index->id); + btr_search_drop_page_hash_index(block); + return; + } + + ut_a(cursor->index == index); + ut_ad(!dict_index_is_ibuf(index)); + rw_lock_x_lock(ahi_latch); + + if (!block->index || !btr_search_enabled) { + + goto func_exit; + } + + ut_a(block->index == index); + + if ((cursor->flag == BTR_CUR_HASH) + && (cursor->n_fields == block->curr_n_fields) + && (cursor->n_bytes == block->curr_n_bytes) + && !block->curr_left_side) { + + if (ha_search_and_update_if_found( + &btr_search_sys.get_part(*cursor->index)->table, + cursor->fold, rec, block, + page_rec_get_next(rec))) { + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_UPDATED); + } + +func_exit: + assert_block_ahi_valid(block); + rw_lock_x_unlock(ahi_latch); + } else { + rw_lock_x_unlock(ahi_latch); + + btr_search_update_hash_on_insert(cursor, ahi_latch); + } +} + +/** Updates the page hash index when a single record is inserted on a page. +@param[in,out] cursor cursor which was positioned to the + place to insert using btr_cur_search_..., + and the new record has been inserted next + to the cursor +@param[in] ahi_latch the adaptive hash index latch */ +void +btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch) +{ + buf_block_t* block; + dict_index_t* index; + const rec_t* rec; + const rec_t* ins_rec; + const rec_t* next_rec; + ulint fold; + ulint ins_fold; + ulint next_fold = 0; /* remove warning (??? bug ???) 
*/ + ulint n_fields; + ulint n_bytes; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(ahi_latch == &btr_search_sys.get_part(*cursor->index)->latch); + ut_ad(page_is_leaf(btr_cur_get_page(cursor))); + ut_ad(!btr_search_own_any(RW_LOCK_S)); + ut_ad(!btr_search_own_any(RW_LOCK_X)); +#ifdef MYSQL_INDEX_DISABLE_AHI + if (cursor->index->disable_ahi) return; +#endif + if (!btr_search_enabled) { + return; + } + + block = btr_cur_get_block(cursor); + + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_X)); + assert_block_ahi_valid(block); + + index = block->index; + + if (!index) { + + return; + } + + ut_ad(block->page.id().space() == index->table->space_id); + btr_search_check_free_space_in_heap(index); + + rec = btr_cur_get_rec(cursor); + +#ifdef MYSQL_INDEX_DISABLE_AHI + ut_a(!index->disable_ahi); +#endif + if (index != cursor->index) { + ut_ad(index->id == cursor->index->id); + btr_search_drop_page_hash_index(block); + return; + } + + ut_a(index == cursor->index); + ut_ad(!dict_index_is_ibuf(index)); + + n_fields = block->curr_n_fields; + n_bytes = block->curr_n_bytes; + const bool left_side = block->curr_left_side; + + ins_rec = page_rec_get_next_const(rec); + next_rec = page_rec_get_next_const(ins_rec); + + offsets = rec_get_offsets(ins_rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + ins_fold = rec_fold(ins_rec, offsets, n_fields, n_bytes, index->id); + + if (!page_rec_is_supremum(next_rec)) { + offsets = rec_get_offsets( + next_rec, index, offsets, index->n_core_fields, + btr_search_get_n_fields(n_fields, n_bytes), &heap); + next_fold = rec_fold(next_rec, offsets, n_fields, + n_bytes, index->id); + } + + /* We must not look up "part" before acquiring ahi_latch. 
*/ + btr_search_sys_t::partition* part= nullptr; + bool locked = false; + + if (!page_rec_is_infimum(rec) && !rec_is_metadata(rec, *index)) { + offsets = rec_get_offsets( + rec, index, offsets, index->n_core_fields, + btr_search_get_n_fields(n_fields, n_bytes), &heap); + fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id); + } else { + if (left_side) { + locked = true; + rw_lock_x_lock(ahi_latch); + + if (!btr_search_enabled || !block->index) { + goto function_exit; + } + + part = btr_search_sys.get_part(*index); + ha_insert_for_fold(&part->table, part->heap, + ins_fold, block, ins_rec); + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } + + goto check_next_rec; + } + + if (fold != ins_fold) { + + if (!locked) { + locked = true; + rw_lock_x_lock(ahi_latch); + + if (!btr_search_enabled || !block->index) { + goto function_exit; + } + + part = btr_search_sys.get_part(*index); + } + + if (!left_side) { + ha_insert_for_fold(&part->table, part->heap, + fold, block, rec); + } else { + ha_insert_for_fold(&part->table, part->heap, + ins_fold, block, ins_rec); + } + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } + +check_next_rec: + if (page_rec_is_supremum(next_rec)) { + + if (!left_side) { + if (!locked) { + locked = true; + rw_lock_x_lock(ahi_latch); + + if (!btr_search_enabled || !block->index) { + goto function_exit; + } + + part = btr_search_sys.get_part(*index); + } + + ha_insert_for_fold(&part->table, part->heap, + ins_fold, block, ins_rec); + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } + + goto function_exit; + } + + if (ins_fold != next_fold) { + if (!locked) { + locked = true; + rw_lock_x_lock(ahi_latch); + + if (!btr_search_enabled || !block->index) { + goto function_exit; + } + + part = btr_search_sys.get_part(*index); + } + + if (!left_side) { + ha_insert_for_fold(&part->table, part->heap, + ins_fold, block, ins_rec); + } else { + ha_insert_for_fold(&part->table, part->heap, + next_fold, block, next_rec); + } + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } + +function_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + if (locked) { + rw_lock_x_unlock(ahi_latch); + } + ut_ad(!rw_lock_own(ahi_latch, RW_LOCK_X)); +} + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +__attribute__((nonnull)) +/** @return whether a range of the cells is valid */ +static bool ha_validate(const hash_table_t *table, + ulint start_index, ulint end_index) +{ + ut_a(start_index <= end_index); + ut_a(end_index < table->n_cells); + + bool ok= true; + + for (ulint i= start_index; i <= end_index; i++) + { + for (auto node= static_cast<const ha_node_t*>(table->array[i].node); node; + node= node->next) + { + if (table->calc_hash(node->fold) != i) { + ib::error() << "Hash table node fold value " << node->fold + << " does not match the cell number " << i; + ok= false; + } + } + } + + return ok; +} + +/** Validates the search system for given hash table. +@param[in] hash_table_id hash table to validate +@return TRUE if ok */ +static +ibool +btr_search_hash_table_validate(ulint hash_table_id) +{ + ha_node_t* node; + ibool ok = TRUE; + ulint i; + ulint cell_count; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + + btr_search_x_lock_all(); + if (!btr_search_enabled) { + btr_search_x_unlock_all(); + return(TRUE); + } + + /* How many cells to check before temporarily releasing + search latches. 
*/ + ulint chunk_size = 10000; + + rec_offs_init(offsets_); + + mysql_mutex_lock(&buf_pool.mutex); + + auto &part = btr_search_sys.parts[hash_table_id]; + + cell_count = part.table.n_cells; + + for (i = 0; i < cell_count; i++) { + /* We release search latches every once in a while to + give other queries a chance to run. */ + if ((i != 0) && ((i % chunk_size) == 0)) { + + mysql_mutex_unlock(&buf_pool.mutex); + btr_search_x_unlock_all(); + + os_thread_yield(); + + btr_search_x_lock_all(); + + if (!btr_search_enabled) { + ok = true; + goto func_exit; + } + + mysql_mutex_lock(&buf_pool.mutex); + + ulint curr_cell_count = part.table.n_cells; + + if (cell_count != curr_cell_count) { + + cell_count = curr_cell_count; + + if (i >= cell_count) { + break; + } + } + } + + node = static_cast<ha_node_t*>(part.table.array[i].node); + + for (; node != NULL; node = node->next) { + const buf_block_t* block + = buf_pool.block_from_ahi((byte*) node->data); + index_id_t page_index_id; + + if (UNIV_LIKELY(block->page.state() + == BUF_BLOCK_FILE_PAGE)) { + + /* The space and offset are only valid + for file blocks. It is possible that + the block is being freed + (BUF_BLOCK_REMOVE_HASH, see the + assertion and the comment below) */ + const page_id_t id(block->page.id()); + if (const buf_page_t* hash_page + = buf_pool.page_hash_get_low( + id, id.fold())) { + ut_ad(hash_page == &block->page); + goto state_ok; + } + } + + /* When a block is being freed, + buf_LRU_search_and_free_block() first removes + the block from buf_pool.page_hash by calling + buf_LRU_block_remove_hashed_page(). Then it + invokes btr_search_drop_page_hash_index(). */ + ut_a(block->page.state() == BUF_BLOCK_REMOVE_HASH); +state_ok: + ut_ad(!dict_index_is_ibuf(block->index)); + ut_ad(block->page.id().space() + == block->index->table->space_id); + + page_index_id = btr_page_get_index_id(block->frame); + + offsets = rec_get_offsets( + node->data, block->index, offsets, + block->index->n_core_fields, + btr_search_get_n_fields(block->curr_n_fields, + block->curr_n_bytes), + &heap); + + const ulint fold = rec_fold( + node->data, offsets, + block->curr_n_fields, + block->curr_n_bytes, + page_index_id); + + if (node->fold != fold) { + const page_t* page = block->frame; + + ok = FALSE; + + ib::error() << "Error in an adaptive hash" + << " index pointer to page " + << block->page.id() + << ", ptr mem address " + << reinterpret_cast<const void*>( + node->data) + << ", index id " << page_index_id + << ", node fold " << node->fold + << ", rec fold " << fold; + + fputs("InnoDB: Record ", stderr); + rec_print_new(stderr, node->data, offsets); + fprintf(stderr, "\nInnoDB: on that page." + " Page mem address %p, is hashed %p," + " n fields %lu\n" + "InnoDB: side %lu\n", + (void*) page, (void*) block->index, + (ulong) block->curr_n_fields, + (ulong) block->curr_left_side); + ut_ad(0); + } + } + } + + for (i = 0; i < cell_count; i += chunk_size) { + /* We release search latches every once in a while to + give other queries a chance to run. 
*/
+		if (i != 0) {
+			mysql_mutex_unlock(&buf_pool.mutex);
+			btr_search_x_unlock_all();
+
+			os_thread_yield();
+
+			btr_search_x_lock_all();
+
+			if (!btr_search_enabled) {
+				ok = true;
+				goto func_exit;
+			}
+
+			mysql_mutex_lock(&buf_pool.mutex);
+
+			ulint curr_cell_count = part.table.n_cells;
+
+			if (cell_count != curr_cell_count) {
+
+				cell_count = curr_cell_count;
+
+				if (i >= cell_count) {
+					break;
+				}
+			}
+		}
+
+		ulint end_index = ut_min(i + chunk_size - 1, cell_count - 1);
+
+		if (!ha_validate(&part.table, i, end_index)) {
+			ok = FALSE;
+		}
+	}
+
+	mysql_mutex_unlock(&buf_pool.mutex);
+func_exit:
+	btr_search_x_unlock_all();
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	return(ok);
+}
+
+/** Validate the search system.
+@return true if ok. */
+bool
+btr_search_validate()
+{
+	for (ulint i = 0; i < btr_ahi_parts; ++i) {
+		if (!btr_search_hash_table_validate(i)) {
+			return(false);
+		}
+	}
+
+	return(true);
+}
+
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
+#endif /* BTR_CUR_HASH_ADAPT */
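
Editorial note on the drop path above: btr_search_drop_page_hash_index() works in two phases so that the per-record work (rec_get_offsets() and rec_fold()) is done without holding the AHI partition latch, and the latch is then taken once to delete all cached fold values; if the page was meanwhile re-hashed with different curr_n_fields/curr_n_bytes, the latch is released and the whole operation retried. The sketch below is a minimal standalone illustration of that fold-and-batch-remove pattern only and is not part of the imported InnoDB source; Record, AhiPartition, fold_record() and drop_page_hash() are hypothetical names, and std::mutex plus std::unordered_multimap stand in for the partition rw-latch and the ha_node_t hash chains.

// Standalone sketch (hypothetical, not InnoDB code): two-phase drop of all
// adaptive-hash entries that point to one page.
#include <cstdint>
#include <iostream>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

// A "record" is a tuple of column values; fold hashes the first n_fields
// columns together with the index id (in the spirit of rec_fold(), only).
using Record = std::vector<std::string>;

static uint64_t fold_record(const Record& rec, size_t n_fields,
                            uint64_t index_id) {
    uint64_t fold = index_id;
    for (size_t i = 0; i < n_fields && i < rec.size(); i++) {
        fold = fold * 1099511628211ULL + std::hash<std::string>{}(rec[i]);
    }
    return fold;
}

// One adaptive-hash partition: a latch plus a fold -> record-pointer table.
struct AhiPartition {
    std::mutex latch;
    std::unordered_multimap<uint64_t, const Record*> table;
};

// Phase 1 (no latch held): cache the fold of every record on the page,
// skipping consecutive duplicates, as the real code does.
// Phase 2 (latch taken once): remove every cached fold from the partition.
static void drop_page_hash(AhiPartition& part,
                           const std::vector<Record>& page,
                           size_t n_fields, uint64_t index_id) {
    std::vector<uint64_t> folds;
    uint64_t prev_fold = 0;
    for (const Record& rec : page) {
        uint64_t fold = fold_record(rec, n_fields, index_id);
        if (fold != prev_fold || prev_fold == 0) {
            folds.push_back(fold);
        }
        prev_fold = fold;
    }

    std::lock_guard<std::mutex> guard(part.latch);
    for (uint64_t fold : folds) {
        part.table.erase(fold);  // analogue of ha_remove_all_nodes_to_page()
    }
}

int main() {
    AhiPartition part;
    std::vector<Record> page = {{"alice", "1"}, {"bob", "2"}};
    const uint64_t index_id = 42;

    // Build: hash a one-field prefix of each record into the partition.
    for (const Record& rec : page) {
        part.table.emplace(fold_record(rec, 1, index_id), &rec);
    }
    std::cout << "entries before drop: " << part.table.size() << '\n';

    drop_page_hash(part, page, 1, index_id);
    std::cout << "entries after drop: " << part.table.size() << '\n';
}

The sketch builds with any C++11 compiler (e.g. g++ -std=c++11 sketch.cc). The design point carried over from the code above is that the latch critical section contains only cheap hash-table deletions, never record parsing or fold computation.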