author    | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000
commit    | 3f619478f796eddbba6e39502fe941b285dd97b1 (patch)
tree      | e2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/innobase/row
parent    | Initial commit. (diff)
download  | mariadb-upstream.tar.xz mariadb-upstream.zip
Adding upstream version 1:10.11.6. (upstream/1%10.11.6, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | storage/innobase/row/row0ext.cc     |  132
-rw-r--r-- | storage/innobase/row/row0ftsort.cc  | 1791
-rw-r--r-- | storage/innobase/row/row0import.cc  | 4585
-rw-r--r-- | storage/innobase/row/row0ins.cc     | 3843
-rw-r--r-- | storage/innobase/row/row0log.cc     | 4134
-rw-r--r-- | storage/innobase/row/row0merge.cc   | 5406
-rw-r--r-- | storage/innobase/row/row0mysql.cc   | 2916
-rw-r--r-- | storage/innobase/row/row0purge.cc   | 1304
-rw-r--r-- | storage/innobase/row/row0quiesce.cc |  715
-rw-r--r-- | storage/innobase/row/row0row.cc     | 1720
-rw-r--r-- | storage/innobase/row/row0sel.cc     | 6947
-rw-r--r-- | storage/innobase/row/row0uins.cc    |  652
-rw-r--r-- | storage/innobase/row/row0umod.cc    | 1288
-rw-r--r-- | storage/innobase/row/row0undo.cc    |  453
-rw-r--r-- | storage/innobase/row/row0upd.cc     | 3002
-rw-r--r-- | storage/innobase/row/row0vers.cc    | 1419
16 files changed, 40307 insertions, 0 deletions
diff --git a/storage/innobase/row/row0ext.cc b/storage/innobase/row/row0ext.cc new file mode 100644 index 00000000..b7a62760 --- /dev/null +++ b/storage/innobase/row/row0ext.cc @@ -0,0 +1,132 @@ +/***************************************************************************** + +Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0ext.cc +Caching of externally stored column prefixes + +Created September 2006 Marko Makela +*******************************************************/ + +#include "row0ext.h" +#include "btr0cur.h" + +/** Fills the column prefix cache of an externally stored column. +@param[in,out] ext column prefix cache +@param[in] i index of ext->ext[] +@param[in] space tablespace +@param[in] dfield data field */ +static +void +row_ext_cache_fill( + row_ext_t* ext, + ulint i, + fil_space_t* space, + const dfield_t* dfield) +{ + const byte* field = static_cast<const byte*>( + dfield_get_data(dfield)); + ulint f_len = dfield_get_len(dfield); + byte* buf = ext->buf + i * ext->max_len; + + ut_ad(ext->max_len > 0); + ut_ad(i < ext->n_ext); + ut_ad(dfield_is_ext(dfield)); + ut_a(f_len >= BTR_EXTERN_FIELD_REF_SIZE); + + if (UNIV_UNLIKELY(!memcmp(field_ref_zero, + field + f_len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE))) { + /* The BLOB pointer is not set: we cannot fetch it */ + ext->len[i] = 0; + } else { + if (ext->max_len == REC_VERSION_56_MAX_INDEX_COL_LEN + && f_len > BTR_EXTERN_FIELD_REF_SIZE) { + /* In this case, the field is in B format or beyond, + (refer to the definition of row_ext_t.max_len) + and the field is already fill with prefix, otherwise + f_len would be BTR_EXTERN_FIELD_REF_SIZE. + So there is no need to re-read the prefix externally, + but just copy the local prefix to buf. Please note + if the ext->len[i] is zero, it means an error + as above. */ + memcpy(buf, field, f_len - BTR_EXTERN_FIELD_REF_SIZE); + ext->len[i] = f_len - BTR_EXTERN_FIELD_REF_SIZE; + } else { + /* Fetch at most ext->max_len of the column. + The column should be non-empty. However, + trx_rollback_all_recovered() may try to + access a half-deleted BLOB if the server previously + crashed during the execution of + btr_free_externally_stored_field(). */ + ext->len[i] = btr_copy_externally_stored_field_prefix( + buf, ext->max_len, ext->zip_size, + field, f_len); + } + } +} + +/********************************************************************//** +Creates a cache of column prefixes of externally stored columns. 
+@return own: column prefix cache */ +row_ext_t* +row_ext_create( +/*===========*/ + ulint n_ext, /*!< in: number of externally stored columns */ + const ulint* ext, /*!< in: col_no's of externally stored columns + in the InnoDB table object, as reported by + dict_col_get_no(); NOT relative to the records + in the clustered index */ + const dict_table_t& table, /*!< in: table */ + const dtuple_t* tuple, /*!< in: data tuple containing the field + references of the externally stored + columns; must be indexed by col_no; + the clustered index record must be + covered by a lock or a page latch + to prevent deletion (rollback or purge). */ + mem_heap_t* heap) /*!< in: heap where created */ +{ + if (!table.space) { + return NULL; + } + + ut_ad(n_ext > 0); + + row_ext_t* ret = static_cast<row_ext_t*>( + mem_heap_alloc(heap, + (sizeof *ret) + (n_ext - 1) * sizeof ret->len)); + + ret->n_ext = n_ext; + ret->ext = ext; + ret->max_len = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(table.flags); + ret->zip_size = dict_tf_get_zip_size(table.flags); + + ret->buf = static_cast<byte*>( + mem_heap_alloc(heap, n_ext * ret->max_len)); + + /* Fetch the BLOB prefixes */ + for (ulint i = 0; i < n_ext; i++) { + const dfield_t* dfield; + + dfield = dtuple_get_nth_field(tuple, ext[i]); + row_ext_cache_fill(ret, i, table.space, dfield); + } + + return(ret); +} diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc new file mode 100644 index 00000000..17a2f034 --- /dev/null +++ b/storage/innobase/row/row0ftsort.cc @@ -0,0 +1,1791 @@ +/***************************************************************************** + +Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0ftsort.cc +Create Full Text Index with (parallel) merge sort + +Created 10/13/2010 Jimmy Yang +*******************************************************/ + +#include "row0ftsort.h" +#include "dict0dict.h" +#include "row0merge.h" +#include "row0row.h" +#include "btr0cur.h" +#include "fts0plugin.h" +#include "log0crypt.h" + +/** Read the next record to buffer N. +@param N index into array of merge info structure */ +#define ROW_MERGE_READ_GET_NEXT(N) \ + do { \ + b[N] = row_merge_read_rec( \ + block[N], buf[N], b[N], index, \ + fd[N], &foffs[N], &mrec[N], offsets[N], \ + crypt_block[N], space); \ + if (UNIV_UNLIKELY(!b[N])) { \ + if (mrec[N]) { \ + goto exit; \ + } \ + } \ + } while (0) + +/** Parallel sort degree */ +ulong fts_sort_pll_degree = 2; + +/*********************************************************************//** +Create a temporary "fts sort index" used to merge sort the +tokenized doc string. 
The index has three "fields": + +1) Tokenized word, +2) Doc ID (depend on number of records to sort, it can be a 4 bytes or 8 bytes +integer value) +3) Word's position in original doc. + +@see fts_create_one_index_table() + +@return dict_index_t structure for the fts sort index */ +dict_index_t* +row_merge_create_fts_sort_index( +/*============================*/ + dict_index_t* index, /*!< in: Original FTS index + based on which this sort index + is created */ + dict_table_t* table, /*!< in,out: table that FTS index + is being created on */ + ibool* opt_doc_id_size) + /*!< out: whether to use 4 bytes + instead of 8 bytes integer to + store Doc ID during sort */ +{ + dict_index_t* new_index; + dict_field_t* field; + dict_field_t* idx_field; + CHARSET_INFO* charset; + + // FIXME: This name shouldn't be hard coded here. + new_index = dict_mem_index_create(table, "tmp_fts_idx", DICT_FTS, 3); + + new_index->id = index->id; + new_index->n_uniq = FTS_NUM_FIELDS_SORT; + new_index->n_def = FTS_NUM_FIELDS_SORT; + new_index->cached = TRUE; + new_index->parser = index->parser; + + idx_field = dict_index_get_nth_field(index, 0); + charset = fts_index_get_charset(index); + + /* The first field is on the Tokenized Word */ + field = dict_index_get_nth_field(new_index, 0); + field->name = NULL; + field->prefix_len = 0; + field->descending = false; + field->col = static_cast<dict_col_t*>( + mem_heap_zalloc(new_index->heap, sizeof(dict_col_t))); + field->col->prtype = idx_field->col->prtype | DATA_NOT_NULL; + field->col->mtype = charset == &my_charset_latin1 + ? DATA_VARCHAR : DATA_VARMYSQL; + field->col->mbminlen = idx_field->col->mbminlen; + field->col->mbmaxlen = idx_field->col->mbmaxlen; + field->col->len = static_cast<uint16_t>( + HA_FT_MAXCHARLEN * field->col->mbmaxlen); + + field->fixed_len = 0; + + /* Doc ID */ + field = dict_index_get_nth_field(new_index, 1); + field->name = NULL; + field->prefix_len = 0; + field->descending = false; + field->col = static_cast<dict_col_t*>( + mem_heap_zalloc(new_index->heap, sizeof(dict_col_t))); + field->col->mtype = DATA_INT; + *opt_doc_id_size = FALSE; + + /* Check whether we can use 4 bytes instead of 8 bytes integer + field to hold the Doc ID, thus reduce the overall sort size */ + if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) { + /* If Doc ID column is being added by this create + index, then just check the number of rows in the table */ + if (dict_table_get_n_rows(table) < MAX_DOC_ID_OPT_VAL) { + *opt_doc_id_size = TRUE; + } + } else { + doc_id_t max_doc_id; + + /* If the Doc ID column is supplied by user, then + check the maximum Doc ID in the table */ + max_doc_id = fts_get_max_doc_id((dict_table_t*) table); + + if (max_doc_id && max_doc_id < MAX_DOC_ID_OPT_VAL) { + *opt_doc_id_size = TRUE; + } + } + + if (*opt_doc_id_size) { + field->col->len = sizeof(ib_uint32_t); + field->fixed_len = sizeof(ib_uint32_t); + } else { + field->col->len = FTS_DOC_ID_LEN; + field->fixed_len = FTS_DOC_ID_LEN; + } + + field->col->prtype = DATA_NOT_NULL | DATA_BINARY_TYPE; + + /* The third field is on the word's position in the original doc */ + field = dict_index_get_nth_field(new_index, 2); + field->name = NULL; + field->prefix_len = 0; + field->descending = false; + field->col = static_cast<dict_col_t*>( + mem_heap_zalloc(new_index->heap, sizeof(dict_col_t))); + field->col->mtype = DATA_INT; + field->col->len = 4 ; + field->fixed_len = 4; + field->col->prtype = DATA_NOT_NULL; + + return(new_index); +} + +/** Initialize FTS parallel sort structures. 
+@param[in] trx transaction +@param[in,out] dup descriptor of FTS index being created +@param[in,out] new_table table where indexes are created +@param[in] opt_doc_id_size whether to use 4 bytes instead of 8 bytes + integer to store Doc ID during sort +@param[in] old_zip_size page size of the old table during alter +@param[out] psort parallel sort info to be instantiated +@param[out] merge parallel merge info to be instantiated +@return true if all successful */ +bool +row_fts_psort_info_init( + trx_t* trx, + row_merge_dup_t*dup, + dict_table_t* new_table, + bool opt_doc_id_size, + ulint old_zip_size, + fts_psort_t** psort, + fts_psort_t** merge) +{ + ulint i; + ulint j; + fts_psort_common_t* common_info = NULL; + fts_psort_t* psort_info = NULL; + fts_psort_t* merge_info = NULL; + ulint block_size; + ibool ret = TRUE; + ut_ad(ut_is_2pow(old_zip_size)); + + block_size = 3 * srv_sort_buf_size; + + *psort = psort_info = static_cast<fts_psort_t*>(ut_zalloc_nokey( + fts_sort_pll_degree * sizeof *psort_info)); + + if (!psort_info) { + ut_free(dup); + return(FALSE); + } + + /* Common Info for all sort threads */ + common_info = static_cast<fts_psort_common_t*>( + ut_malloc_nokey(sizeof *common_info)); + + if (!common_info) { + ut_free(dup); + ut_free(psort_info); + return(FALSE); + } + + common_info->dup = dup; + common_info->new_table = new_table; + common_info->old_zip_size = old_zip_size; + common_info->trx = trx; + common_info->all_info = psort_info; + pthread_cond_init(&common_info->sort_cond, nullptr); + common_info->opt_doc_id_size = opt_doc_id_size; + + ut_ad(trx->mysql_thd != NULL); + const char* path = thd_innodb_tmpdir(trx->mysql_thd); + /* There will be FTS_NUM_AUX_INDEX number of "sort buckets" for + each parallel sort thread. Each "sort bucket" holds records for + a particular "FTS index partition" */ + for (j = 0; j < fts_sort_pll_degree; j++) { + + UT_LIST_INIT( + psort_info[j].fts_doc_list, &fts_doc_item_t::doc_list); + + for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { + + psort_info[j].merge_file[i] = + static_cast<merge_file_t*>( + ut_zalloc_nokey(sizeof(merge_file_t))); + + if (!psort_info[j].merge_file[i]) { + ret = FALSE; + goto func_exit; + } + + psort_info[j].merge_buf[i] = row_merge_buf_create( + dup->index); + + if (row_merge_file_create(psort_info[j].merge_file[i], + path) == OS_FILE_CLOSED) { + goto func_exit; + } + + /* Need to align memory for O_DIRECT write */ + psort_info[j].merge_block[i] = + static_cast<row_merge_block_t*>( + aligned_malloc(block_size, 1024)); + + if (!psort_info[j].merge_block[i]) { + ret = FALSE; + goto func_exit; + } + + /* If tablespace is encrypted, allocate additional buffer for + encryption/decryption. 
*/ + if (srv_encrypt_log) { + /* Need to align memory for O_DIRECT write */ + psort_info[j].crypt_block[i] = + static_cast<row_merge_block_t*>( + aligned_malloc(block_size, + 1024)); + + if (!psort_info[j].crypt_block[i]) { + ret = FALSE; + goto func_exit; + } + } else { + psort_info[j].crypt_block[i] = NULL; + } + } + + psort_info[j].child_status = 0; + psort_info[j].state = 0; + psort_info[j].psort_common = common_info; + psort_info[j].error = DB_SUCCESS; + psort_info[j].memory_used = 0; + mysql_mutex_init(0, &psort_info[j].mutex, nullptr); + } + + /* Initialize merge_info structures parallel merge and insert + into auxiliary FTS tables (FTS_INDEX_TABLE) */ + *merge = merge_info = static_cast<fts_psort_t*>( + ut_malloc_nokey(FTS_NUM_AUX_INDEX * sizeof *merge_info)); + + for (j = 0; j < FTS_NUM_AUX_INDEX; j++) { + + merge_info[j].child_status = 0; + merge_info[j].state = 0; + merge_info[j].psort_common = common_info; + } + +func_exit: + if (!ret) { + row_fts_psort_info_destroy(psort_info, merge_info); + } + + return(ret); +} +/*********************************************************************//** +Clean up and deallocate FTS parallel sort structures, and close the +merge sort files */ +void +row_fts_psort_info_destroy( +/*=======================*/ + fts_psort_t* psort_info, /*!< parallel sort info */ + fts_psort_t* merge_info) /*!< parallel merge info */ +{ + ulint i; + ulint j; + + if (psort_info) { + for (j = 0; j < fts_sort_pll_degree; j++) { + for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { + if (psort_info[j].merge_file[i]) { + row_merge_file_destroy( + psort_info[j].merge_file[i]); + } + + aligned_free(psort_info[j].merge_block[i]); + ut_free(psort_info[j].merge_file[i]); + aligned_free(psort_info[j].crypt_block[i]); + } + + mysql_mutex_destroy(&psort_info[j].mutex); + } + + pthread_cond_destroy(&merge_info[0].psort_common->sort_cond); + ut_free(merge_info[0].psort_common->dup); + ut_free(merge_info[0].psort_common); + ut_free(psort_info); + } + + ut_free(merge_info); +} +/*********************************************************************//** +Free up merge buffers when merge sort is done */ +void +row_fts_free_pll_merge_buf( +/*=======================*/ + fts_psort_t* psort_info) /*!< in: parallel sort info */ +{ + ulint j; + ulint i; + + if (!psort_info) { + return; + } + + for (j = 0; j < fts_sort_pll_degree; j++) { + for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { + row_merge_buf_free(psort_info[j].merge_buf[i]); + } + } + + return; +} + +/*********************************************************************//** +FTS plugin parser 'myql_add_word' callback function for row merge. +Refer to 'st_mysql_ftparser_param' for more detail. 
+@return always returns 0 */ +static +int +row_merge_fts_doc_add_word_for_parser( +/*==================================*/ + MYSQL_FTPARSER_PARAM *param, /* in: parser paramter */ + const char *word, /* in: token word */ + int word_len, /* in: word len */ + MYSQL_FTPARSER_BOOLEAN_INFO* boolean_info) /* in: boolean info */ +{ + fts_string_t str; + fts_tokenize_ctx_t* t_ctx; + row_fts_token_t* fts_token; + byte* ptr; + + ut_ad(param); + ut_ad(param->mysql_ftparam); + ut_ad(word); + ut_ad(boolean_info); + + t_ctx = static_cast<fts_tokenize_ctx_t*>(param->mysql_ftparam); + ut_ad(t_ctx); + + str.f_str = (byte*)(word); + str.f_len = ulint(word_len); + str.f_n_char = fts_get_token_size( + (CHARSET_INFO*)param->cs, word, ulint(word_len)); + + /* JAN: TODO: MySQL 5.7 FTS + ut_ad(boolean_info->position >= 0); + */ + + ptr = static_cast<byte*>(ut_malloc_nokey(sizeof(row_fts_token_t) + + sizeof(fts_string_t) + str.f_len)); + fts_token = reinterpret_cast<row_fts_token_t*>(ptr); + fts_token->text = reinterpret_cast<fts_string_t*>( + ptr + sizeof(row_fts_token_t)); + fts_token->text->f_str = static_cast<byte*>( + ptr + sizeof(row_fts_token_t) + sizeof(fts_string_t)); + + fts_token->text->f_len = str.f_len; + fts_token->text->f_n_char = str.f_n_char; + memcpy(fts_token->text->f_str, str.f_str, str.f_len); + + /* JAN: TODO: MySQL 5.7 FTS + fts_token->position = boolean_info->position; + */ + + /* Add token to list */ + UT_LIST_ADD_LAST(t_ctx->fts_token_list, fts_token); + + return(0); +} + +/*********************************************************************//** +Tokenize by fts plugin parser */ +static +void +row_merge_fts_doc_tokenize_by_parser( +/*=================================*/ + fts_doc_t* doc, /* in: doc to tokenize */ + st_mysql_ftparser* parser, /* in: plugin parser instance */ + fts_tokenize_ctx_t* t_ctx) /* in/out: tokenize ctx instance */ +{ + MYSQL_FTPARSER_PARAM param; + + ut_a(parser); + + /* Set paramters for param */ + param.mysql_parse = fts_tokenize_document_internal; + param.mysql_add_word = row_merge_fts_doc_add_word_for_parser; + param.mysql_ftparam = t_ctx; + param.cs = doc->charset; + param.doc = reinterpret_cast<char*>(doc->text.f_str); + param.length = static_cast<int>(doc->text.f_len); + param.mode= MYSQL_FTPARSER_SIMPLE_MODE; + + PARSER_INIT(parser, ¶m); + /* We assume parse returns successfully here. */ + parser->parse(¶m); + PARSER_DEINIT(parser, ¶m); +} + +/*********************************************************************//** +Tokenize incoming text data and add to the sort buffer. 
+@see row_merge_buf_encode() +@return TRUE if the record passed, FALSE if out of space */ +static +ibool +row_merge_fts_doc_tokenize( +/*=======================*/ + row_merge_buf_t** sort_buf, /*!< in/out: sort buffer */ + doc_id_t doc_id, /*!< in: Doc ID */ + fts_doc_t* doc, /*!< in: Doc to be tokenized */ + merge_file_t** merge_file, /*!< in/out: merge file */ + ibool opt_doc_id_size,/*!< in: whether to use 4 bytes + instead of 8 bytes integer to + store Doc ID during sort*/ + fts_tokenize_ctx_t* t_ctx) /*!< in/out: tokenize context */ +{ + ulint inc = 0; + fts_string_t str; + ulint len; + row_merge_buf_t* buf; + dfield_t* field; + fts_string_t t_str; + ibool buf_full = FALSE; + byte str_buf[FTS_MAX_WORD_LEN + 1]; + ulint data_size[FTS_NUM_AUX_INDEX]; + ulint n_tuple[FTS_NUM_AUX_INDEX]; + st_mysql_ftparser* parser; + + t_str.f_n_char = 0; + t_ctx->buf_used = 0; + + memset(n_tuple, 0, FTS_NUM_AUX_INDEX * sizeof(ulint)); + memset(data_size, 0, FTS_NUM_AUX_INDEX * sizeof(ulint)); + + parser = sort_buf[0]->index->parser; + + /* Tokenize the data and add each word string, its corresponding + doc id and position to sort buffer */ + while (parser + ? (!t_ctx->processed_len + || UT_LIST_GET_LEN(t_ctx->fts_token_list)) + : t_ctx->processed_len < doc->text.f_len) { + ulint idx = 0; + ulint cur_len; + doc_id_t write_doc_id; + row_fts_token_t* fts_token = NULL; + + if (parser != NULL) { + if (t_ctx->processed_len == 0) { + UT_LIST_INIT(t_ctx->fts_token_list, &row_fts_token_t::token_list); + + /* Parse the whole doc and cache tokens */ + row_merge_fts_doc_tokenize_by_parser(doc, + parser, t_ctx); + + /* Just indictate we have parsed all the word */ + t_ctx->processed_len += 1; + } + + /* Then get a token */ + fts_token = UT_LIST_GET_FIRST(t_ctx->fts_token_list); + if (fts_token) { + str.f_len = fts_token->text->f_len; + str.f_n_char = fts_token->text->f_n_char; + str.f_str = fts_token->text->f_str; + } else { + ut_ad(UT_LIST_GET_LEN(t_ctx->fts_token_list) == 0); + /* Reach the end of the list */ + t_ctx->processed_len = doc->text.f_len; + break; + } + } else { + inc = innobase_mysql_fts_get_token( + doc->charset, + doc->text.f_str + t_ctx->processed_len, + doc->text.f_str + doc->text.f_len, &str); + + ut_a(inc > 0); + } + + /* Ignore string whose character number is less than + "fts_min_token_size" or more than "fts_max_token_size" */ + if (!fts_check_token(&str, NULL, NULL)) { + if (parser != NULL) { + UT_LIST_REMOVE(t_ctx->fts_token_list, fts_token); + ut_free(fts_token); + } else { + t_ctx->processed_len += inc; + } + + continue; + } + + t_str.f_len = innobase_fts_casedn_str( + doc->charset, (char*) str.f_str, str.f_len, + (char*) &str_buf, FTS_MAX_WORD_LEN + 1); + + t_str.f_str = (byte*) &str_buf; + + /* if "cached_stopword" is defined, ignore words in the + stopword list */ + if (!fts_check_token(&str, t_ctx->cached_stopword, + doc->charset)) { + if (parser != NULL) { + UT_LIST_REMOVE(t_ctx->fts_token_list, fts_token); + ut_free(fts_token); + } else { + t_ctx->processed_len += inc; + } + + continue; + } + + /* There are FTS_NUM_AUX_INDEX auxiliary tables, find + out which sort buffer to put this word record in */ + t_ctx->buf_used = fts_select_index( + doc->charset, t_str.f_str, t_str.f_len); + + buf = sort_buf[t_ctx->buf_used]; + + ut_a(t_ctx->buf_used < FTS_NUM_AUX_INDEX); + idx = t_ctx->buf_used; + + mtuple_t* mtuple = &buf->tuples[buf->n_tuples + n_tuple[idx]]; + + field = mtuple->fields = static_cast<dfield_t*>( + mem_heap_alloc(buf->heap, + FTS_NUM_FIELDS_SORT * sizeof *field)); + + /* The 
first field is the tokenized word */ + dfield_set_data(field, t_str.f_str, t_str.f_len); + len = dfield_get_len(field); + + dict_col_copy_type(dict_index_get_nth_col(buf->index, 0), &field->type); + field->type.prtype |= DATA_NOT_NULL; + ut_ad(len <= field->type.len); + + /* For the temporary file, row_merge_buf_encode() uses + 1 byte for representing the number of extra_size bytes. + This number will always be 1, because for this 3-field index + consisting of one variable-size column, extra_size will always + be 1 or 2, which can be encoded in one byte. + + The extra_size is 1 byte if the length of the + variable-length column is less than 128 bytes or the + maximum length is less than 256 bytes. */ + + /* One variable length column, word with its lenght less than + fts_max_token_size, add one extra size and one extra byte. + + Since the max length for FTS token now is larger than 255, + so we will need to signify length byte itself, so only 1 to 128 + bytes can be used for 1 bytes, larger than that 2 bytes. */ + if (len < 128 || field->type.len < 256) { + /* Extra size is one byte. */ + cur_len = 2 + len; + } else { + /* Extra size is two bytes. */ + cur_len = 3 + len; + } + + dfield_dup(field, buf->heap); + field++; + + /* The second field is the Doc ID */ + + ib_uint32_t doc_id_32_bit; + + if (!opt_doc_id_size) { + fts_write_doc_id((byte*) &write_doc_id, doc_id); + + dfield_set_data( + field, &write_doc_id, sizeof(write_doc_id)); + } else { + mach_write_to_4( + (byte*) &doc_id_32_bit, (ib_uint32_t) doc_id); + + dfield_set_data( + field, &doc_id_32_bit, sizeof(doc_id_32_bit)); + } + + len = field->len; + ut_ad(len == FTS_DOC_ID_LEN || len == sizeof(ib_uint32_t)); + + field->type.mtype = DATA_INT; + field->type.prtype = DATA_NOT_NULL | DATA_BINARY_TYPE; + field->type.len = static_cast<uint16_t>(field->len); + field->type.mbminlen = 0; + field->type.mbmaxlen = 0; + + cur_len += len; + dfield_dup(field, buf->heap); + + ++field; + + /* The third field is the position. + MySQL 5.7 changed the fulltext parser plugin interface + by adding MYSQL_FTPARSER_BOOLEAN_INFO::position. + Below we assume that the field is always 0. */ + ulint pos = t_ctx->init_pos; + byte position[4]; + if (parser == NULL) { + pos += t_ctx->processed_len + inc - str.f_len; + } + len = 4; + mach_write_to_4(position, pos); + dfield_set_data(field, &position, len); + + field->type.mtype = DATA_INT; + field->type.prtype = DATA_NOT_NULL; + field->type.len = 4; + field->type.mbminlen = 0; + field->type.mbmaxlen = 0; + cur_len += len; + dfield_dup(field, buf->heap); + + /* Reserve one byte for the end marker of row_merge_block_t */ + if (buf->total_size + data_size[idx] + cur_len + >= srv_sort_buf_size - 1) { + + buf_full = TRUE; + break; + } + + /* Increment the number of tuples */ + n_tuple[idx]++; + if (parser != NULL) { + UT_LIST_REMOVE(t_ctx->fts_token_list, fts_token); + ut_free(fts_token); + } else { + t_ctx->processed_len += inc; + } + data_size[idx] += cur_len; + } + + /* Update the data length and the number of new word tuples + added in this round of tokenization */ + for (ulint i = 0; i < FTS_NUM_AUX_INDEX; i++) { + /* The computation of total_size below assumes that no + delete-mark flags will be stored and that all fields + are NOT NULL and fixed-length. 
*/ + + sort_buf[i]->total_size += data_size[i]; + + sort_buf[i]->n_tuples += n_tuple[i]; + + merge_file[i]->n_rec += n_tuple[i]; + t_ctx->rows_added[i] += n_tuple[i]; + } + + if (!buf_full) { + /* we pad one byte between text accross two fields */ + t_ctx->init_pos += doc->text.f_len + 1; + } + + return(!buf_full); +} + +/*********************************************************************//** +Get next doc item from fts_doc_list */ +UNIV_INLINE +void +row_merge_fts_get_next_doc_item( +/*============================*/ + fts_psort_t* psort_info, /*!< in: psort_info */ + fts_doc_item_t** doc_item) /*!< in/out: doc item */ +{ + if (*doc_item != NULL) { + ut_free(*doc_item); + } + + mysql_mutex_lock(&psort_info->mutex); + + *doc_item = UT_LIST_GET_FIRST(psort_info->fts_doc_list); + if (*doc_item != NULL) { + UT_LIST_REMOVE(psort_info->fts_doc_list, *doc_item); + + ut_ad(psort_info->memory_used >= sizeof(fts_doc_item_t) + + (*doc_item)->field->len); + psort_info->memory_used -= sizeof(fts_doc_item_t) + + (*doc_item)->field->len; + } + + mysql_mutex_unlock(&psort_info->mutex); +} + +/*********************************************************************//** +Function performs parallel tokenization of the incoming doc strings. +It also performs the initial in memory sort of the parsed records. +*/ +static +void fts_parallel_tokenization( +/*======================*/ + void* arg) /*!< in: psort_info for the thread */ +{ + fts_psort_t* psort_info = (fts_psort_t*) arg; + ulint i; + fts_doc_item_t* doc_item = NULL; + row_merge_buf_t** buf; + ibool processed = FALSE; + merge_file_t** merge_file; + row_merge_block_t** block; + row_merge_block_t** crypt_block; + pfs_os_file_t tmpfd[FTS_NUM_AUX_INDEX]; + ulint mycount[FTS_NUM_AUX_INDEX]; + ulint num_doc_processed = 0; + doc_id_t last_doc_id = 0; + mem_heap_t* blob_heap = NULL; + fts_doc_t doc; + dict_table_t* table = psort_info->psort_common->new_table; + fts_tokenize_ctx_t t_ctx; + ulint retried = 0; + dberr_t error = DB_SUCCESS; + + ut_ad(psort_info->psort_common->trx->mysql_thd != NULL); + + /* const char* path = thd_innodb_tmpdir( + psort_info->psort_common->trx->mysql_thd); + */ + + ut_ad(psort_info->psort_common->trx->mysql_thd != NULL); + + const char* path = thd_innodb_tmpdir( + psort_info->psort_common->trx->mysql_thd); + + ut_ad(psort_info); + + buf = psort_info->merge_buf; + merge_file = psort_info->merge_file; + blob_heap = mem_heap_create(512); + memset(&doc, 0, sizeof(doc)); + memset(mycount, 0, FTS_NUM_AUX_INDEX * sizeof(int)); + + doc.charset = fts_index_get_charset( + psort_info->psort_common->dup->index); + + block = psort_info->merge_block; + crypt_block = psort_info->crypt_block; + + const ulint zip_size = psort_info->psort_common->old_zip_size; + + row_merge_fts_get_next_doc_item(psort_info, &doc_item); + + t_ctx.cached_stopword = table->fts->cache->stopword_info.cached_stopword; + processed = TRUE; +loop: + while (doc_item) { + dfield_t* dfield = doc_item->field; + + last_doc_id = doc_item->doc_id; + + ut_ad (dfield->data != NULL + && dfield_get_len(dfield) != UNIV_SQL_NULL); + + /* If finish processing the last item, update "doc" with + strings in the doc_item, otherwise continue processing last + item */ + if (processed) { + byte* data; + ulint data_len; + + dfield = doc_item->field; + data = static_cast<byte*>(dfield_get_data(dfield)); + data_len = dfield_get_len(dfield); + + if (dfield_is_ext(dfield)) { + doc.text.f_str = + btr_copy_externally_stored_field( + &doc.text.f_len, data, + zip_size, data_len, blob_heap); + } else { + 
doc.text.f_str = data; + doc.text.f_len = data_len; + } + + doc.tokens = 0; + t_ctx.processed_len = 0; + } else { + /* Not yet finish processing the "doc" on hand, + continue processing it */ + ut_ad(doc.text.f_str); + ut_ad(buf[0]->index->parser + || t_ctx.processed_len < doc.text.f_len); + } + + processed = row_merge_fts_doc_tokenize( + buf, doc_item->doc_id, &doc, + merge_file, psort_info->psort_common->opt_doc_id_size, + &t_ctx); + + /* Current sort buffer full, need to recycle */ + if (!processed) { + ut_ad(buf[0]->index->parser + || t_ctx.processed_len < doc.text.f_len); + ut_ad(t_ctx.rows_added[t_ctx.buf_used]); + break; + } + + num_doc_processed++; + + if (UNIV_UNLIKELY(fts_enable_diag_print) + && num_doc_processed % 10000 == 1) { + ib::info() << "Number of documents processed: " + << num_doc_processed; +#ifdef FTS_INTERNAL_DIAG_PRINT + for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { + ib::info() << "ID " << psort_info->psort_id + << ", partition " << i << ", word " + << mycount[i]; + } +#endif + } + + mem_heap_empty(blob_heap); + + row_merge_fts_get_next_doc_item(psort_info, &doc_item); + + if (doc_item && last_doc_id != doc_item->doc_id) { + t_ctx.init_pos = 0; + } + } + + /* If we run out of current sort buffer, need to sort + and flush the sort buffer to disk */ + if (t_ctx.rows_added[t_ctx.buf_used] && !processed) { + row_merge_buf_sort(buf[t_ctx.buf_used], NULL); + row_merge_buf_write(buf[t_ctx.buf_used], +#ifndef DBUG_OFF + merge_file[t_ctx.buf_used], +#endif + block[t_ctx.buf_used]); + + if (!row_merge_write(merge_file[t_ctx.buf_used]->fd, + merge_file[t_ctx.buf_used]->offset++, + block[t_ctx.buf_used], + crypt_block[t_ctx.buf_used], + table->space_id)) { + error = DB_TEMP_FILE_WRITE_FAIL; + goto func_exit; + } + + MEM_UNDEFINED(block[t_ctx.buf_used], srv_sort_buf_size); + buf[t_ctx.buf_used] = row_merge_buf_empty(buf[t_ctx.buf_used]); + mycount[t_ctx.buf_used] += t_ctx.rows_added[t_ctx.buf_used]; + t_ctx.rows_added[t_ctx.buf_used] = 0; + + ut_a(doc_item); + goto loop; + } + + /* Parent done scanning, and if finish processing all the docs, exit */ + if (psort_info->state == FTS_PARENT_COMPLETE) { + if (UT_LIST_GET_LEN(psort_info->fts_doc_list) == 0) { + goto exit; + } else if (retried > 10000) { + ut_ad(!doc_item); + /* retried too many times and cannot get new record */ + ib::error() << "FTS parallel sort processed " + << num_doc_processed + << " records, the sort queue has " + << UT_LIST_GET_LEN(psort_info->fts_doc_list) + << " records. But sort cannot get the next" + " records during alter table " << table->name; + goto exit; + } + } else if (psort_info->state == FTS_PARENT_EXITING) { + /* Parent abort */ + goto func_exit; + } + + if (doc_item == NULL) { + std::this_thread::yield(); + } + + row_merge_fts_get_next_doc_item(psort_info, &doc_item); + + if (doc_item != NULL) { + if (last_doc_id != doc_item->doc_id) { + t_ctx.init_pos = 0; + } + + retried = 0; + } else if (psort_info->state == FTS_PARENT_COMPLETE) { + retried++; + } + + goto loop; + +exit: + /* Do a final sort of the last (or latest) batch of records + in block memory. 
Flush them to temp file if records cannot + be hold in one block memory */ + for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { + if (t_ctx.rows_added[i]) { + row_merge_buf_sort(buf[i], NULL); + row_merge_buf_write(buf[i], +#ifndef DBUG_OFF + merge_file[i], +#endif + block[i]); + + /* Write to temp file, only if records have + been flushed to temp file before (offset > 0): + The pseudo code for sort is following: + + while (there are rows) { + tokenize rows, put result in block[] + if (block[] runs out) { + sort rows; + write to temp file with + row_merge_write(); + offset++; + } + } + + # write out the last batch + if (offset > 0) { + row_merge_write(); + offset++; + } else { + # no need to write anything + offset stay as 0 + } + + so if merge_file[i]->offset is 0 when we come to + here as the last batch, this means rows have + never flush to temp file, it can be held all in + memory */ + if (merge_file[i]->offset != 0) { + if (!row_merge_write(merge_file[i]->fd, + merge_file[i]->offset++, + block[i], + crypt_block[i], + table->space_id)) { + error = DB_TEMP_FILE_WRITE_FAIL; + goto func_exit; + } + +#ifdef HAVE_valgrind + MEM_UNDEFINED(block[i], srv_sort_buf_size); + + if (crypt_block[i]) { + MEM_UNDEFINED(crypt_block[i], + srv_sort_buf_size); + } +#endif /* HAVE_valgrind */ + } + + buf[i] = row_merge_buf_empty(buf[i]); + t_ctx.rows_added[i] = 0; + } + } + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + DEBUG_FTS_SORT_PRINT(" InnoDB_FTS: start merge sort\n"); + } + + for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { + if (!merge_file[i]->offset) { + continue; + } + + tmpfd[i] = row_merge_file_create_low(path); + if (tmpfd[i] == OS_FILE_CLOSED) { + error = DB_OUT_OF_MEMORY; + goto func_exit; + } + + error = row_merge_sort(psort_info->psort_common->trx, + psort_info->psort_common->dup, + merge_file[i], block[i], &tmpfd[i], + false, 0.0/* pct_progress */, 0.0/* pct_cost */, + crypt_block[i], table->space_id); + + if (error != DB_SUCCESS) { + row_merge_file_destroy_low(tmpfd[i]); + goto func_exit; + } + + row_merge_file_destroy_low(tmpfd[i]); + } + +func_exit: + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + DEBUG_FTS_SORT_PRINT(" InnoDB_FTS: complete merge sort\n"); + } + + mem_heap_free(blob_heap); + + mysql_mutex_lock(&psort_info->mutex); + psort_info->error = error; + mysql_mutex_unlock(&psort_info->mutex); + + if (UT_LIST_GET_LEN(psort_info->fts_doc_list) > 0) { + /* child can exit either with error or told by parent. */ + ut_ad(error != DB_SUCCESS + || psort_info->state == FTS_PARENT_EXITING); + } + + /* Free fts doc list in case of error. */ + do { + row_merge_fts_get_next_doc_item(psort_info, &doc_item); + } while (doc_item != NULL); + + mysql_mutex_lock(&psort_info->mutex); + psort_info->child_status = FTS_CHILD_COMPLETE; + pthread_cond_signal(&psort_info->psort_common->sort_cond); + mysql_mutex_unlock(&psort_info->mutex); +} + +/*********************************************************************//** +Start the parallel tokenization and parallel merge sort */ +void +row_fts_start_psort( +/*================*/ + fts_psort_t* psort_info) /*!< parallel sort structure */ +{ + ulint i = 0; + + for (i = 0; i < fts_sort_pll_degree; i++) { + psort_info[i].psort_id = i; + psort_info[i].task = + new tpool::waitable_task(fts_parallel_tokenization,&psort_info[i]); + srv_thread_pool->submit_task(psort_info[i].task); + } +} + +/*********************************************************************//** +Function performs the merge and insertion of the sorted records. 
*/ +static +void +fts_parallel_merge( +/*===============*/ + void* arg) /*!< in: parallel merge info */ +{ + fts_psort_t* psort_info = (fts_psort_t*) arg; + ulint id; + + ut_ad(psort_info); + + id = psort_info->psort_id; + + row_fts_merge_insert(psort_info->psort_common->dup->index, + psort_info->psort_common->new_table, + psort_info->psort_common->all_info, id); +} + +/*********************************************************************//** +Kick off the parallel merge and insert thread */ +void +row_fts_start_parallel_merge( +/*=========================*/ + fts_psort_t* merge_info) /*!< in: parallel sort info */ +{ + ulint i = 0; + + /* Kick off merge/insert tasks */ + for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { + merge_info[i].psort_id = i; + merge_info[i].child_status = 0; + + merge_info[i].task = new tpool::waitable_task( + fts_parallel_merge, + (void*) &merge_info[i]); + srv_thread_pool->submit_task(merge_info[i].task); + } +} + +/** +Write out a single word's data as new entry/entries in the INDEX table. +@param[in] ins_ctx insert context +@param[in] word word string +@param[in] node node colmns +@return DB_SUCCUESS if insertion runs fine, otherwise error code */ +static +dberr_t +row_merge_write_fts_node( + const fts_psort_insert_t* ins_ctx, + const fts_string_t* word, + const fts_node_t* node) +{ + dtuple_t* tuple; + dfield_t* field; + dberr_t ret = DB_SUCCESS; + doc_id_t write_first_doc_id[8]; + doc_id_t write_last_doc_id[8]; + ib_uint32_t write_doc_count; + + tuple = ins_ctx->tuple; + + /* The first field is the tokenized word */ + field = dtuple_get_nth_field(tuple, 0); + dfield_set_data(field, word->f_str, word->f_len); + + /* The second field is first_doc_id */ + field = dtuple_get_nth_field(tuple, 1); + fts_write_doc_id((byte*)&write_first_doc_id, node->first_doc_id); + dfield_set_data(field, &write_first_doc_id, sizeof(doc_id_t)); + + /* The third and fourth fileds(TRX_ID, ROLL_PTR) are filled already.*/ + /* The fifth field is last_doc_id */ + field = dtuple_get_nth_field(tuple, 4); + fts_write_doc_id((byte*)&write_last_doc_id, node->last_doc_id); + dfield_set_data(field, &write_last_doc_id, sizeof(doc_id_t)); + + /* The sixth field is doc_count */ + field = dtuple_get_nth_field(tuple, 5); + mach_write_to_4((byte*)&write_doc_count, (ib_uint32_t)node->doc_count); + dfield_set_data(field, &write_doc_count, sizeof(ib_uint32_t)); + + /* The seventh field is ilist */ + field = dtuple_get_nth_field(tuple, 6); + dfield_set_data(field, node->ilist, node->ilist_size); + + ret = ins_ctx->btr_bulk->insert(tuple); + + return(ret); +} + +/********************************************************************//** +Insert processed FTS data to auxillary index tables. 
+@return DB_SUCCESS if insertion runs fine */ +static MY_ATTRIBUTE((nonnull)) +dberr_t +row_merge_write_fts_word( +/*=====================*/ + fts_psort_insert_t* ins_ctx, /*!< in: insert context */ + fts_tokenizer_word_t* word) /*!< in: sorted and tokenized + word */ +{ + dberr_t ret = DB_SUCCESS; + + ut_ad(ins_ctx->aux_index_id == fts_select_index( + ins_ctx->charset, word->text.f_str, word->text.f_len)); + + /* Pop out each fts_node in word->nodes write them to auxiliary table */ + for (ulint i = 0; i < ib_vector_size(word->nodes); i++) { + dberr_t error; + fts_node_t* fts_node; + + fts_node = static_cast<fts_node_t*>(ib_vector_get(word->nodes, i)); + + error = row_merge_write_fts_node(ins_ctx, &word->text, fts_node); + + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + ib::error() << "Failed to write word to FTS auxiliary" + " index table " + << ins_ctx->btr_bulk->table_name() + << ", error " << error; + ret = error; + } + + ut_free(fts_node->ilist); + fts_node->ilist = NULL; + } + + ib_vector_reset(word->nodes); + + return(ret); +} + +/*********************************************************************//** +Read sorted FTS data files and insert data tuples to auxillary tables. +@return DB_SUCCESS or error number */ +static +void +row_fts_insert_tuple( +/*=================*/ + fts_psort_insert_t* + ins_ctx, /*!< in: insert context */ + fts_tokenizer_word_t* word, /*!< in: last processed + tokenized word */ + ib_vector_t* positions, /*!< in: word position */ + doc_id_t* in_doc_id, /*!< in: last item doc id */ + dtuple_t* dtuple) /*!< in: entry to insert */ +{ + fts_node_t* fts_node = NULL; + dfield_t* dfield; + doc_id_t doc_id; + ulint position; + fts_string_t token_word; + ulint i; + + /* Get fts_node for the FTS auxillary INDEX table */ + if (ib_vector_size(word->nodes) > 0) { + fts_node = static_cast<fts_node_t*>( + ib_vector_last(word->nodes)); + } + + if (fts_node == NULL + || fts_node->ilist_size > FTS_ILIST_MAX_SIZE) { + + fts_node = static_cast<fts_node_t*>( + ib_vector_push(word->nodes, NULL)); + + memset(fts_node, 0x0, sizeof(*fts_node)); + } + + /* If dtuple == NULL, this is the last word to be processed */ + if (!dtuple) { + if (fts_node && ib_vector_size(positions) > 0) { + fts_cache_node_add_positions( + NULL, fts_node, *in_doc_id, + positions); + + /* Write out the current word */ + row_merge_write_fts_word(ins_ctx, word); + } + + return; + } + + /* Get the first field for the tokenized word */ + dfield = dtuple_get_nth_field(dtuple, 0); + + token_word.f_n_char = 0; + token_word.f_len = dfield->len; + token_word.f_str = static_cast<byte*>(dfield_get_data(dfield)); + + if (!word->text.f_str) { + fts_string_dup(&word->text, &token_word, ins_ctx->heap); + } + + /* compare to the last word, to see if they are the same + word */ + if (innobase_fts_text_cmp(ins_ctx->charset, + &word->text, &token_word) != 0) { + ulint num_item; + + /* Getting a new word, flush the last position info + for the currnt word in fts_node */ + if (ib_vector_size(positions) > 0) { + fts_cache_node_add_positions( + NULL, fts_node, *in_doc_id, positions); + } + + /* Write out the current word */ + row_merge_write_fts_word(ins_ctx, word); + + /* Copy the new word */ + fts_string_dup(&word->text, &token_word, ins_ctx->heap); + + num_item = ib_vector_size(positions); + + /* Clean up position queue */ + for (i = 0; i < num_item; i++) { + ib_vector_pop(positions); + } + + /* Reset Doc ID */ + *in_doc_id = 0; + memset(fts_node, 0x0, sizeof(*fts_node)); + } + + /* Get the word's Doc ID */ + dfield = 
dtuple_get_nth_field(dtuple, 1); + + if (!ins_ctx->opt_doc_id_size) { + doc_id = fts_read_doc_id( + static_cast<byte*>(dfield_get_data(dfield))); + } else { + doc_id = (doc_id_t) mach_read_from_4( + static_cast<byte*>(dfield_get_data(dfield))); + } + + /* Get the word's position info */ + dfield = dtuple_get_nth_field(dtuple, 2); + position = mach_read_from_4(static_cast<byte*>(dfield_get_data(dfield))); + + /* If this is the same word as the last word, and they + have the same Doc ID, we just need to add its position + info. Otherwise, we will flush position info to the + fts_node and initiate a new position vector */ + if (!(*in_doc_id) || *in_doc_id == doc_id) { + ib_vector_push(positions, &position); + } else { + ulint num_pos = ib_vector_size(positions); + + fts_cache_node_add_positions(NULL, fts_node, + *in_doc_id, positions); + for (i = 0; i < num_pos; i++) { + ib_vector_pop(positions); + } + ib_vector_push(positions, &position); + } + + /* record the current Doc ID */ + *in_doc_id = doc_id; +} + +/*********************************************************************//** +Propagate a newly added record up one level in the selection tree +@return parent where this value propagated to */ +static +ulint +row_fts_sel_tree_propagate( +/*=======================*/ + ulint propogated, /*<! in: tree node propagated */ + int* sel_tree, /*<! in: selection tree */ + const mrec_t** mrec, /*<! in: sort record */ + rec_offs** offsets, /*<! in: record offsets */ + dict_index_t* index) /*<! in/out: FTS index */ +{ + ulint parent; + int child_left; + int child_right; + int selected; + + /* Find which parent this value will be propagated to */ + parent = (propogated - 1) / 2; + + /* Find out which value is smaller, and to propagate */ + child_left = sel_tree[parent * 2 + 1]; + child_right = sel_tree[parent * 2 + 2]; + + if (child_left == -1 || mrec[child_left] == NULL) { + if (child_right == -1 + || mrec[child_right] == NULL) { + selected = -1; + } else { + selected = child_right ; + } + } else if (child_right == -1 + || mrec[child_right] == NULL) { + selected = child_left; + } else if (cmp_rec_rec_simple(mrec[child_left], mrec[child_right], + offsets[child_left], + offsets[child_right], + index, NULL) < 0) { + selected = child_left; + } else { + selected = child_right; + } + + sel_tree[parent] = selected; + + return parent; +} + +/*********************************************************************//** +Readjust selection tree after popping the root and read a new value +@return the new root */ +static +int +row_fts_sel_tree_update( +/*====================*/ + int* sel_tree, /*<! in/out: selection tree */ + ulint propagated, /*<! in: node to propagate up */ + ulint height, /*<! in: tree height */ + const mrec_t** mrec, /*<! in: sort record */ + rec_offs** offsets, /*<! in: record offsets */ + dict_index_t* index) /*<! in: index dictionary */ +{ + ulint i; + + for (i = 1; i <= height; i++) { + propagated = row_fts_sel_tree_propagate( + propagated, sel_tree, mrec, offsets, index); + } + + return(sel_tree[0]); +} + +/*********************************************************************//** +Build selection tree at a specified level */ +static +void +row_fts_build_sel_tree_level( +/*=========================*/ + int* sel_tree, /*<! in/out: selection tree */ + ulint level, /*<! in: selection tree level */ + const mrec_t** mrec, /*<! in: sort record */ + rec_offs** offsets, /*<! in: record offsets */ + dict_index_t* index) /*<! 
in: index dictionary */ +{ + ulint start; + int child_left; + int child_right; + ulint i; + ulint num_item = ulint(1) << level; + + start = num_item - 1; + + for (i = 0; i < num_item; i++) { + child_left = sel_tree[(start + i) * 2 + 1]; + child_right = sel_tree[(start + i) * 2 + 2]; + + if (child_left == -1) { + if (child_right == -1) { + sel_tree[start + i] = -1; + } else { + sel_tree[start + i] = child_right; + } + continue; + } else if (child_right == -1) { + sel_tree[start + i] = child_left; + continue; + } + + /* Deal with NULL child conditions */ + if (!mrec[child_left]) { + if (!mrec[child_right]) { + sel_tree[start + i] = -1; + } else { + sel_tree[start + i] = child_right; + } + continue; + } else if (!mrec[child_right]) { + sel_tree[start + i] = child_left; + continue; + } + + /* Select the smaller one to set parent pointer */ + int cmp = cmp_rec_rec_simple( + mrec[child_left], mrec[child_right], + offsets[child_left], offsets[child_right], + index, NULL); + + sel_tree[start + i] = cmp < 0 ? child_left : child_right; + } +} + +/*********************************************************************//** +Build a selection tree for merge. The selection tree is a binary tree +and should have fts_sort_pll_degree / 2 levels. With root as level 0 +@return number of tree levels */ +static +ulint +row_fts_build_sel_tree( +/*===================*/ + int* sel_tree, /*<! in/out: selection tree */ + const mrec_t** mrec, /*<! in: sort record */ + rec_offs** offsets, /*<! in: record offsets */ + dict_index_t* index) /*<! in: index dictionary */ +{ + ulint treelevel = 1; + ulint num = 2; + ulint i = 0; + ulint start; + + /* No need to build selection tree if we only have two merge threads */ + if (fts_sort_pll_degree <= 2) { + return(0); + } + + while (num < fts_sort_pll_degree) { + num = num << 1; + treelevel++; + } + + start = (ulint(1) << treelevel) - 1; + + for (i = 0; i < fts_sort_pll_degree; i++) { + sel_tree[i + start] = int(i); + } + + i = treelevel; + do { + row_fts_build_sel_tree_level( + sel_tree, --i, mrec, offsets, index); + } while (i > 0); + + return(treelevel); +} + +/*********************************************************************//** +Read sorted file containing index data tuples and insert these data +tuples to the index +@return DB_SUCCESS or error number */ +dberr_t +row_fts_merge_insert( +/*=================*/ + dict_index_t* index, /*!< in: index */ + dict_table_t* table, /*!< in: new table */ + fts_psort_t* psort_info, /*!< parallel sort info */ + ulint id) /* !< in: which auxiliary table's data + to insert to */ +{ + const byte** b; + mem_heap_t* tuple_heap; + mem_heap_t* heap; + dberr_t error = DB_SUCCESS; + ulint* foffs; + rec_offs** offsets; + fts_tokenizer_word_t new_word; + ib_vector_t* positions; + doc_id_t last_doc_id; + ib_alloc_t* heap_alloc; + ulint i; + mrec_buf_t** buf; + pfs_os_file_t* fd; + byte** block; + byte** crypt_block; + const mrec_t** mrec; + ulint count = 0; + int* sel_tree; + ulint height; + ulint start; + fts_psort_insert_t ins_ctx; + uint64_t count_diag = 0; + fts_table_t fts_table; + char aux_table_name[MAX_FULL_NAME_LEN]; + dict_table_t* aux_table; + dict_index_t* aux_index; + trx_t* trx; + + /* We use the insert query graph as the dummy graph + needed in the row module call */ + + trx = trx_create(); + trx_start_if_not_started(trx, true); + + trx->op_info = "inserting index entries"; + + ins_ctx.opt_doc_id_size = psort_info[0].psort_common->opt_doc_id_size; + + heap = mem_heap_create(500 + sizeof(mrec_buf_t)); + + b = (const byte**) 
mem_heap_alloc( + heap, sizeof (*b) * fts_sort_pll_degree); + foffs = (ulint*) mem_heap_alloc( + heap, sizeof(*foffs) * fts_sort_pll_degree); + offsets = (rec_offs**) mem_heap_alloc( + heap, sizeof(*offsets) * fts_sort_pll_degree); + buf = (mrec_buf_t**) mem_heap_alloc( + heap, sizeof(*buf) * fts_sort_pll_degree); + fd = (pfs_os_file_t*) mem_heap_alloc(heap, sizeof(*fd) * fts_sort_pll_degree); + block = (byte**) mem_heap_alloc( + heap, sizeof(*block) * fts_sort_pll_degree); + crypt_block = (byte**) mem_heap_alloc( + heap, sizeof(*block) * fts_sort_pll_degree); + mrec = (const mrec_t**) mem_heap_alloc( + heap, sizeof(*mrec) * fts_sort_pll_degree); + sel_tree = (int*) mem_heap_alloc( + heap, sizeof(*sel_tree) * (fts_sort_pll_degree * 2)); + + tuple_heap = mem_heap_create(1000); + + ins_ctx.charset = fts_index_get_charset(index); + ins_ctx.heap = heap; + + for (i = 0; i < fts_sort_pll_degree; i++) { + ulint num; + + num = 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + offsets[i] = static_cast<rec_offs*>(mem_heap_zalloc( + heap, num * sizeof *offsets[i])); + rec_offs_set_n_alloc(offsets[i], num); + rec_offs_set_n_fields(offsets[i], dict_index_get_n_fields(index)); + block[i] = psort_info[i].merge_block[id]; + crypt_block[i] = psort_info[i].crypt_block[id]; + b[i] = psort_info[i].merge_block[id]; + fd[i] = psort_info[i].merge_file[id]->fd; + foffs[i] = 0; + + buf[i] = static_cast<mrec_buf_t*>( + mem_heap_alloc(heap, sizeof *buf[i])); + + count_diag += psort_info[i].merge_file[id]->n_rec; + } + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + ib::info() << "InnoDB_FTS: to insert " << count_diag + << " records"; + } + + /* Initialize related variables if creating FTS indexes */ + heap_alloc = ib_heap_allocator_create(heap); + + memset(&new_word, 0, sizeof(new_word)); + + new_word.nodes = ib_vector_create(heap_alloc, sizeof(fts_node_t), 4); + positions = ib_vector_create(heap_alloc, sizeof(ulint), 32); + last_doc_id = 0; + + /* We should set the flags2 with aux_table_name here, + in order to get the correct aux table names. 
*/ + index->table->flags2 |= DICT_TF2_FTS_AUX_HEX_NAME; + DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", + index->table->flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME + & ((1U << DICT_TF2_BITS) - 1);); + fts_table.type = FTS_INDEX_TABLE; + fts_table.index_id = index->id; + fts_table.table_id = table->id; + fts_table.table = index->table; + fts_table.suffix = fts_get_suffix(id); + + /* Get aux index */ + fts_get_table_name(&fts_table, aux_table_name); + aux_table = dict_table_open_on_name(aux_table_name, false, + DICT_ERR_IGNORE_NONE); + ut_ad(aux_table != NULL); + aux_index = dict_table_get_first_index(aux_table); + + ut_ad(!aux_index->is_instant()); + /* row_merge_write_fts_node() depends on the correct value */ + ut_ad(aux_index->n_core_null_bytes + == UT_BITS_IN_BYTES(aux_index->n_nullable)); + + /* Create bulk load instance */ + ins_ctx.btr_bulk = UT_NEW_NOKEY(BtrBulk(aux_index, trx)); + + /* Create tuple for insert */ + ins_ctx.tuple = dtuple_create(heap, dict_index_get_n_fields(aux_index)); + dict_index_copy_types(ins_ctx.tuple, aux_index, + dict_index_get_n_fields(aux_index)); + + /* Set TRX_ID and ROLL_PTR */ + dfield_set_data(dtuple_get_nth_field(ins_ctx.tuple, 2), + &reset_trx_id, DATA_TRX_ID_LEN); + dfield_set_data(dtuple_get_nth_field(ins_ctx.tuple, 3), + &reset_trx_id[DATA_TRX_ID_LEN], DATA_ROLL_PTR_LEN); + + ut_d(ins_ctx.aux_index_id = id); + + const ulint space = table->space_id; + + for (i = 0; i < fts_sort_pll_degree; i++) { + if (psort_info[i].merge_file[id]->n_rec == 0) { + /* No Rows to read */ + mrec[i] = b[i] = NULL; + } else { + /* Read from temp file only if it has been + written to. Otherwise, block memory holds + all the sorted records */ + if (psort_info[i].merge_file[id]->offset > 0 + && (!row_merge_read( + fd[i], foffs[i], + (row_merge_block_t*) block[i], + (row_merge_block_t*) crypt_block[i], + space))) { + error = DB_CORRUPTION; + goto exit; + } + + ROW_MERGE_READ_GET_NEXT(i); + } + } + + height = row_fts_build_sel_tree(sel_tree, (const mrec_t **) mrec, + offsets, index); + + start = (1U << height) - 1; + + /* Fetch sorted records from sort buffer and insert them into + corresponding FTS index auxiliary tables */ + for (;;) { + dtuple_t* dtuple; + int min_rec = 0; + + if (fts_sort_pll_degree <= 2) { + while (!mrec[min_rec]) { + min_rec++; + + if (min_rec >= (int) fts_sort_pll_degree) { + row_fts_insert_tuple( + &ins_ctx, &new_word, + positions, &last_doc_id, + NULL); + + goto exit; + } + } + + for (i = min_rec + 1; i < fts_sort_pll_degree; i++) { + if (!mrec[i]) { + continue; + } + + if (cmp_rec_rec_simple( + mrec[i], mrec[min_rec], + offsets[i], offsets[min_rec], + index, NULL) < 0) { + min_rec = static_cast<int>(i); + } + } + } else { + min_rec = sel_tree[0]; + + if (min_rec == -1) { + row_fts_insert_tuple( + &ins_ctx, &new_word, + positions, &last_doc_id, + NULL); + + goto exit; + } + } + + dtuple = row_rec_to_index_entry_low( + mrec[min_rec], index, offsets[min_rec], + tuple_heap); + + row_fts_insert_tuple( + &ins_ctx, &new_word, positions, + &last_doc_id, dtuple); + + + ROW_MERGE_READ_GET_NEXT(min_rec); + + if (fts_sort_pll_degree > 2) { + if (!mrec[min_rec]) { + sel_tree[start + min_rec] = -1; + } + + row_fts_sel_tree_update(sel_tree, start + min_rec, + height, mrec, + offsets, index); + } + + count++; + + mem_heap_empty(tuple_heap); + } + +exit: + fts_sql_commit(trx); + + trx->op_info = ""; + + mem_heap_free(tuple_heap); + + error = ins_ctx.btr_bulk->finish(error); + UT_DELETE(ins_ctx.btr_bulk); + + aux_table->release(); + + trx->free(); + + 
mem_heap_free(heap); + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + ib::info() << "InnoDB_FTS: inserted " << count << " records"; + } + + return(error); +} diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc new file mode 100644 index 00000000..d2609fdb --- /dev/null +++ b/storage/innobase/row/row0import.cc @@ -0,0 +1,4585 @@ +/***************************************************************************** + +Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0import.cc +Import a tablespace to a running instance. + +Created 2012-02-08 by Sunny Bains. +*******************************************************/ + +#include "row0import.h" +#include "btr0pcur.h" +#ifdef BTR_CUR_HASH_ADAPT +# include "btr0sea.h" +#endif +#include "buf0flu.h" +#include "que0que.h" +#include "dict0boot.h" +#include "dict0load.h" +#include "pars0pars.h" +#include "row0row.h" +#include "row0sel.h" +#include "row0mysql.h" +#include "srv0start.h" +#include "row0quiesce.h" +#include "fil0pagecompress.h" +#include "trx0undo.h" +#include "lock0lock.h" +#include "lzo/lzo1x.h" +#include "snappy-c.h" +#include "log.h" + +#include "scope.h" + +#include <vector> + +#ifdef HAVE_MY_AES_H +#include <my_aes.h> +#endif + +using st_::span; + +/** The size of the buffer to use for IO. +@param n physical page size +@return number of pages */ +#define IO_BUFFER_SIZE(n) ((1024 * 1024) / (n)) + +/** For gathering stats on records during phase I */ +struct row_stats_t { + ulint m_n_deleted; /*!< Number of deleted records + found in the index */ + + ulint m_n_purged; /*!< Number of records purged + optimisatically */ + + ulint m_n_rows; /*!< Number of rows */ + + ulint m_n_purge_failed; /*!< Number of deleted rows + that could not be purged */ +}; + +/** Index information required by IMPORT. 
*/ +struct row_index_t { + index_id_t m_id; /*!< Index id of the table + in the exporting server */ + byte* m_name; /*!< Index name */ + + uint32_t m_space; /*!< Space where it is placed */ + + uint32_t m_page_no; /*!< Root page number */ + + ulint m_type; /*!< Index type */ + + ulint m_trx_id_offset; /*!< Relevant only for clustered + indexes, offset of transaction + id system column */ + + ulint m_n_user_defined_cols; /*!< User defined columns */ + + ulint m_n_uniq; /*!< Number of columns that can + uniquely identify the row */ + + ulint m_n_nullable; /*!< Number of nullable + columns */ + + ulint m_n_fields; /*!< Total number of fields */ + + dict_field_t* m_fields; /*!< Index fields */ + + const dict_index_t* + m_srv_index; /*!< Index instance in the + importing server */ + + row_stats_t m_stats; /*!< Statistics gathered during + the import phase */ + +}; + +/** Meta data required by IMPORT. */ +struct row_import { + row_import() UNIV_NOTHROW + : + m_table(NULL), + m_version(0), + m_hostname(NULL), + m_table_name(NULL), + m_autoinc(0), + m_zip_size(0), + m_flags(0), + m_n_cols(0), + m_cols(NULL), + m_col_names(NULL), + m_n_indexes(0), + m_indexes(NULL), + m_missing(true) { } + + ~row_import() UNIV_NOTHROW; + + /** Find the index entry in in the indexes array. + @param name index name + @return instance if found else 0. */ + row_index_t* get_index(const char* name) const UNIV_NOTHROW; + + /** Get the number of rows in the index. + @param name index name + @return number of rows (doesn't include delete marked rows). */ + ulint get_n_rows(const char* name) const UNIV_NOTHROW; + + /** Find the ordinal value of the column name in the cfg table columns. + @param name of column to look for. + @return ULINT_UNDEFINED if not found. */ + ulint find_col(const char* name) const UNIV_NOTHROW; + + /** Get the number of rows for which purge failed during the + convert phase. + @param name index name + @return number of rows for which purge failed. */ + ulint get_n_purge_failed(const char* name) const UNIV_NOTHROW; + + /** Check if the index is clean. ie. no delete-marked records + @param name index name + @return true if index needs to be purged. */ + bool requires_purge(const char* name) const UNIV_NOTHROW + { + return(get_n_purge_failed(name) > 0); + } + + /** Set the index root <space, pageno> using the index name */ + void set_root_by_name() UNIV_NOTHROW; + + /** Set the index root <space, pageno> using a heuristic + @return DB_SUCCESS or error code */ + dberr_t set_root_by_heuristic() UNIV_NOTHROW; + + /** Check if the index schema that was read from the .cfg file + matches the in memory index definition. + Note: It will update row_import_t::m_srv_index to map the meta-data + read from the .cfg file to the server index instance. + @return DB_SUCCESS or error code. */ + dberr_t match_index_columns( + THD* thd, + const dict_index_t* index) UNIV_NOTHROW; + + /** Check if the table schema that was read from the .cfg file + matches the in memory table definition. + @param thd MySQL session variable + @return DB_SUCCESS or error code. */ + dberr_t match_table_columns( + THD* thd) UNIV_NOTHROW; + + /** Check if the table (and index) schema that was read from the + .cfg file matches the in memory table definition. + @param thd MySQL session variable + @return DB_SUCCESS or error code. 
*/ + dberr_t match_schema( + THD* thd) UNIV_NOTHROW; + + dberr_t match_flags(THD *thd) const ; + + + dict_table_t* m_table; /*!< Table instance */ + + ulint m_version; /*!< Version of config file */ + + byte* m_hostname; /*!< Hostname where the + tablespace was exported */ + byte* m_table_name; /*!< Exporting instance table + name */ + + ib_uint64_t m_autoinc; /*!< Next autoinc value */ + + ulint m_zip_size; /*!< ROW_FORMAT=COMPRESSED + page size, or 0 */ + + ulint m_flags; /*!< Table flags */ + + ulint m_n_cols; /*!< Number of columns in the + meta-data file */ + + dict_col_t* m_cols; /*!< Column data */ + + byte** m_col_names; /*!< Column names, we store the + column naems separately becuase + there is no field to store the + value in dict_col_t */ + + ulint m_n_indexes; /*!< Number of indexes, + including clustered index */ + + row_index_t* m_indexes; /*!< Index meta data */ + + bool m_missing; /*!< true if a .cfg file was + found and was readable */ +}; + +struct fil_iterator_t { + pfs_os_file_t file; /*!< File handle */ + const char* filepath; /*!< File path name */ + os_offset_t start; /*!< From where to start */ + os_offset_t end; /*!< Where to stop */ + os_offset_t file_size; /*!< File size in bytes */ + ulint n_io_buffers; /*!< Number of pages to use + for IO */ + byte* io_buffer; /*!< Buffer to use for IO */ + fil_space_crypt_t *crypt_data; /*!< Crypt data (if encrypted) */ + byte* crypt_io_buffer; /*!< IO buffer when encrypted */ +}; + +/** Use the page cursor to iterate over records in a block. */ +class RecIterator { +public: + /** Default constructor */ + RecIterator() UNIV_NOTHROW + { + memset(&m_cur, 0x0, sizeof(m_cur)); + /* Make page_cur_delete_rec() happy. */ + m_mtr.start(); + m_mtr.set_log_mode(MTR_LOG_NO_REDO); + } + + /** Position the cursor on the first user record. */ + rec_t* open(buf_block_t* block, const dict_index_t* index) noexcept + MY_ATTRIBUTE((warn_unused_result)) + { + m_cur.index = const_cast<dict_index_t*>(index); + page_cur_set_before_first(block, &m_cur); + return next(); + } + + /** Move to the next record. */ + rec_t* next() noexcept MY_ATTRIBUTE((warn_unused_result)) + { + return page_cur_move_to_next(&m_cur); + } + + /** + @return the current record */ + rec_t* current() UNIV_NOTHROW + { + ut_ad(!end()); + return(page_cur_get_rec(&m_cur)); + } + + buf_block_t* current_block() const { return m_cur.block; } + + /** + @return true if cursor is at the end */ + bool end() UNIV_NOTHROW + { + return(page_cur_is_after_last(&m_cur) == TRUE); + } + + /** Remove the current record + @return true on success */ + bool remove(rec_offs* offsets) UNIV_NOTHROW + { + const dict_index_t* const index = m_cur.index; + ut_ad(page_is_leaf(m_cur.block->page.frame)); + /* We can't end up with an empty page unless it is root. 
*/ + if (page_get_n_recs(m_cur.block->page.frame) <= 1) { + return(false); + } + + if (!rec_offs_any_extern(offsets) + && m_cur.block->page.id().page_no() != index->page + && ((page_get_data_size(m_cur.block->page.frame) + - rec_offs_size(offsets) + < BTR_CUR_PAGE_COMPRESS_LIMIT(index)) + || !page_has_siblings(m_cur.block->page.frame) + || (page_get_n_recs(m_cur.block->page.frame) < 2))) { + return false; + } + +#ifdef UNIV_ZIP_DEBUG + page_zip_des_t* page_zip = buf_block_get_page_zip(m_cur.block); + ut_a(!page_zip || page_zip_validate( + page_zip, m_cur.block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + + page_cur_delete_rec(&m_cur, offsets, &m_mtr); + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate( + page_zip, m_cur.block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + + return true; + } + +private: + page_cur_t m_cur; +public: + mtr_t m_mtr; +}; + +/** Class that purges delete marked records from indexes, both secondary +and cluster. It does a pessimistic delete. This should only be done if we +couldn't purge the delete marked reocrds during Phase I. */ +class IndexPurge { +public: + /** Constructor + @param trx the user transaction covering the import tablespace + @param index to be imported + @param space_id space id of the tablespace */ + IndexPurge( + trx_t* trx, + dict_index_t* index) UNIV_NOTHROW + : + m_trx(trx), + m_index(index), + m_n_rows(0) + { + ib::info() << "Phase II - Purge records from index " + << index->name; + } + + /** Destructor */ + ~IndexPurge() UNIV_NOTHROW = default; + + /** Purge delete marked records. + @return DB_SUCCESS or error code. */ + dberr_t garbage_collect() UNIV_NOTHROW; + + /** The number of records that are not delete marked. + @return total records in the index after purge */ + ulint get_n_rows() const UNIV_NOTHROW + { + return(m_n_rows); + } + +private: + /** Begin import, position the cursor on the first record. */ + inline bool open() noexcept; + + /** Close the persistent cursor and commit the mini-transaction. */ + void close() noexcept { m_mtr.commit(); btr_pcur_close(&m_pcur); } + + /** Position the cursor on the next record. + @return DB_SUCCESS or error code */ + dberr_t next() noexcept; + + /** Store the persistent cursor position and reopen the + B-tree cursor in BTR_MODIFY_TREE mode, because the + tree structure may be changed during a pessimistic delete. */ + inline dberr_t purge_pessimistic_delete() noexcept; + + /** Purge a delete-marked record. */ + dberr_t purge() noexcept; + +protected: + // Disable copying + IndexPurge(); + IndexPurge(const IndexPurge&); + IndexPurge &operator=(const IndexPurge&); + +private: + trx_t* m_trx; /*!< User transaction */ + mtr_t m_mtr; /*!< Mini-transaction */ + btr_pcur_t m_pcur; /*!< Persistent cursor */ + dict_index_t* m_index; /*!< Index to be processed */ + ulint m_n_rows; /*!< Records in index */ +}; + +/** Functor that is called for each physical page that is read from the +tablespace file. 
*/ +class AbstractCallback +{ +public: + /** Constructor + @param trx covering transaction */ + AbstractCallback(trx_t* trx, uint32_t space_id) + : + m_zip_size(0), + m_trx(trx), + m_space(space_id), + m_xdes(), + m_xdes_page_no(UINT32_MAX), + m_space_flags(UINT32_MAX) UNIV_NOTHROW { } + + /** Free any extent descriptor instance */ + virtual ~AbstractCallback() + { + UT_DELETE_ARRAY(m_xdes); + } + + /** Determine the page size to use for traversing the tablespace + @param file_size size of the tablespace file in bytes + @param block contents of the first page in the tablespace file. + @retval DB_SUCCESS or error code. */ + virtual dberr_t init( + os_offset_t file_size, + const buf_block_t* block) UNIV_NOTHROW; + + /** @return true if compressed table. */ + bool is_compressed_table() const UNIV_NOTHROW + { + return get_zip_size(); + } + + /** @return the tablespace flags */ + uint32_t get_space_flags() const { return m_space_flags; } + + /** + Set the name of the physical file and the file handle that is used + to open it for the file that is being iterated over. + @param filename the physical name of the tablespace file + @param file OS file handle */ + void set_file(const char* filename, pfs_os_file_t file) UNIV_NOTHROW + { + m_file = file; + m_filepath = filename; + } + + ulint get_zip_size() const { return m_zip_size; } + ulint physical_size() const + { + return m_zip_size ? m_zip_size : srv_page_size; + } + + const char* filename() const { return m_filepath; } + + /** + Called for every page in the tablespace. If the page was not + updated then its state must be set to BUF_PAGE_NOT_USED. For + compressed tables the page descriptor memory will be at offset: + block->page.frame + srv_page_size; + @param block block read from file, note it is not from the buffer pool + @retval DB_SUCCESS or error code. */ + virtual dberr_t operator()(buf_block_t* block) UNIV_NOTHROW = 0; + + /** @return the tablespace identifier */ + uint32_t get_space_id() const { return m_space; } + + bool is_interrupted() const { return trx_is_interrupted(m_trx); } + + /** + Get the data page depending on the table type, compressed or not. + @param block - block read from disk + @retval the buffer frame */ + static byte* get_frame(const buf_block_t* block) + { + return block->page.zip.data + ? block->page.zip.data : block->page.frame; + } + + /** Invoke the functionality for the callback */ + virtual dberr_t run(const fil_iterator_t& iter, + buf_block_t* block) UNIV_NOTHROW = 0; + +protected: + /** Get the physical offset of the extent descriptor within the page. + @param page_no page number of the extent descriptor + @param page contents of the page containing the extent descriptor. + @return the start of the xdes array in a page */ + const xdes_t* xdes( + ulint page_no, + const page_t* page) const UNIV_NOTHROW + { + ulint offset; + + offset = xdes_calc_descriptor_index(get_zip_size(), page_no); + + return(page + XDES_ARR_OFFSET + XDES_SIZE * offset); + } + + /** Set the current page directory (xdes). If the extent descriptor is + marked as free then free the current extent descriptor and set it to + 0. This implies that all pages that are covered by this extent + descriptor are also freed. + + @param page_no offset of page within the file + @param page page contents + @return DB_SUCCESS or error code. 
*/ + dberr_t set_current_xdes( + uint32_t page_no, + const page_t* page) UNIV_NOTHROW + { + m_xdes_page_no = page_no; + + UT_DELETE_ARRAY(m_xdes); + m_xdes = NULL; + + if (mach_read_from_4(XDES_ARR_OFFSET + XDES_STATE + page) + != XDES_FREE) { + const ulint physical_size = m_zip_size + ? m_zip_size : srv_page_size; + + m_xdes = UT_NEW_ARRAY_NOKEY(xdes_t, physical_size); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_13", + UT_DELETE_ARRAY(m_xdes); + m_xdes = NULL; + ); + + if (m_xdes == NULL) { + return(DB_OUT_OF_MEMORY); + } + + memcpy(m_xdes, page, physical_size); + } + + return(DB_SUCCESS); + } + + /** Check if the page is marked as free in the extent descriptor. + @param page_no page number to check in the extent descriptor. + @return true if the page is marked as free */ + bool is_free(uint32_t page_no) const UNIV_NOTHROW + { + ut_a(xdes_calc_descriptor_page(get_zip_size(), page_no) + == m_xdes_page_no); + + if (m_xdes != 0) { + const xdes_t* xdesc = xdes(page_no, m_xdes); + ulint pos = page_no % FSP_EXTENT_SIZE; + + return xdes_is_free(xdesc, pos); + } + + /* If the current xdes was free, the page must be free. */ + return(true); + } + +protected: + /** The ROW_FORMAT=COMPRESSED page size, or 0. */ + ulint m_zip_size; + + /** File handle to the tablespace */ + pfs_os_file_t m_file; + + /** Physical file path. */ + const char* m_filepath; + + /** Covering transaction. */ + trx_t* m_trx; + + /** Space id of the file being iterated over. */ + uint32_t m_space; + + /** Current extent descriptor page */ + xdes_t* m_xdes; + + /** Physical page offset in the file of the extent descriptor */ + uint32_t m_xdes_page_no; + + /** Flags value read from the header page */ + uint32_t m_space_flags; +}; + +ATTRIBUTE_COLD static dberr_t invalid_space_flags(uint32_t flags) +{ + if (fsp_flags_is_incompatible_mysql(flags)) + { + sql_print_error("InnoDB: unsupported MySQL tablespace"); + return DB_UNSUPPORTED; + } + + sql_print_error("InnoDB: Invalid FSP_SPACE_FLAGS=0x%" PRIx32, flags); + return DB_CORRUPTION; +} + +/** Determine the page size to use for traversing the tablespace +@param file_size size of the tablespace file in bytes +@param block contents of the first page in the tablespace file. +@retval DB_SUCCESS or error code. */ +dberr_t +AbstractCallback::init( + os_offset_t file_size, + const buf_block_t* block) UNIV_NOTHROW +{ + const page_t* page = block->page.frame; + + m_space_flags = fsp_header_get_flags(page); + if (!fil_space_t::is_valid_flags(m_space_flags, true)) { + uint32_t cflags = fsp_flags_convert_from_101(m_space_flags); + if (cflags == UINT32_MAX) { + return DB_CORRUPTION; + } + m_space_flags = cflags; + } + + /* Clear the DATA_DIR flag, which is basically garbage. 
*/ + m_space_flags &= ~(1U << FSP_FLAGS_POS_RESERVED); + m_zip_size = fil_space_t::zip_size(m_space_flags); + const ulint logical_size = fil_space_t::logical_size(m_space_flags); + const ulint physical_size = fil_space_t::physical_size(m_space_flags); + + if (logical_size != srv_page_size) { + + ib::error() << "Page size " << logical_size + << " of ibd file is not the same as the server page" + " size " << srv_page_size; + + return(DB_CORRUPTION); + + } else if (file_size & (physical_size - 1)) { + + ib::error() << "File size " << file_size << " is not a" + " multiple of the page size " + << physical_size; + + return(DB_CORRUPTION); + } + + if (m_space == UINT32_MAX) { + m_space = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + + page); + } + + return set_current_xdes(0, page); +} + +/** +TODO: This can be made parallel trivially by chunking up the file +and creating a callback per thread.. Main benefit will be to use +multiple CPUs for checksums and compressed tables. We have to do +compressed tables block by block right now. Secondly we need to +decompress/compress and copy too much of data. These are +CPU intensive. + +Iterate over all the pages in the tablespace. +@param iter - Tablespace iterator +@param block - block to use for IO +@param callback - Callback to inspect and update page contents +@retval DB_SUCCESS or error code */ +static dberr_t fil_iterate( + const fil_iterator_t& iter, + buf_block_t* block, + AbstractCallback& callback); + +/** +Try and determine the index root pages by checking if the next/prev +pointers are both FIL_NULL. We need to ensure that skip deleted pages. */ +struct FetchIndexRootPages : public AbstractCallback { + + /** Index information gathered from the .ibd file. */ + struct Index { + + Index(index_id_t id, uint32_t page_no) + : + m_id(id), + m_page_no(page_no) { } + + index_id_t m_id; /*!< Index id */ + uint32_t m_page_no; /*!< Root page number */ + }; + + /** Constructor + @param trx covering (user) transaction + @param table table definition in server .*/ + FetchIndexRootPages(const dict_table_t* table, trx_t* trx) + : + AbstractCallback(trx, UINT32_MAX), + m_table(table), m_index(0, 0) UNIV_NOTHROW { } + + /** Destructor */ + ~FetchIndexRootPages() UNIV_NOTHROW override = default; + + /** Fetch the clustered index root page in the tablespace + @param iter Tablespace iterator + @param block Block to use for IO + @retval DB_SUCCESS or error code */ + dberr_t run(const fil_iterator_t& iter, + buf_block_t* block) UNIV_NOTHROW override; + + /** Called for each block as it is read from the file. + @param block block to convert, it is not from the buffer pool. + @retval DB_SUCCESS or error code. */ + dberr_t operator()(buf_block_t* block) UNIV_NOTHROW override; + + /** Update the import configuration that will be used to import + the tablespace. */ + dberr_t build_row_import(row_import* cfg) const UNIV_NOTHROW; + + /** Table definition in server. */ + const dict_table_t* m_table; + + /** Index information */ + Index m_index; +}; + +/** Called for each block as it is read from the file. Check index pages to +determine the exact row format. We can't get that from the tablespace +header flags alone. + +@param block block to convert, it is not from the buffer pool. +@retval DB_SUCCESS or error code. 
*/ +dberr_t FetchIndexRootPages::operator()(buf_block_t* block) UNIV_NOTHROW +{ + if (is_interrupted()) return DB_INTERRUPTED; + + const page_t* page = get_frame(block); + + m_index.m_id = btr_page_get_index_id(page); + m_index.m_page_no = block->page.id().page_no(); + + /* Check that the tablespace flags match the table flags. */ + const uint32_t expected = dict_tf_to_fsp_flags(m_table->flags); + if (!fsp_flags_match(expected, m_space_flags)) { + ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Expected FSP_SPACE_FLAGS=0x%x, .ibd " + "file contains 0x%x.", + unsigned(expected), + unsigned(m_space_flags)); + return(DB_CORRUPTION); + } + + if (!page_is_comp(block->page.frame) != + !dict_table_is_comp(m_table)) { + ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "ROW_FORMAT mismatch"); + return DB_CORRUPTION; + } + + return DB_SUCCESS; +} + +/** +Update the import configuration that will be used to import the tablespace. +@return error code or DB_SUCCESS */ +dberr_t +FetchIndexRootPages::build_row_import(row_import* cfg) const UNIV_NOTHROW +{ + ut_a(cfg->m_table == m_table); + cfg->m_zip_size = m_zip_size; + cfg->m_n_indexes = 1; + + if (cfg->m_n_indexes == 0) { + + ib::error() << "No B+Tree found in tablespace"; + + return(DB_CORRUPTION); + } + + cfg->m_indexes = UT_NEW_ARRAY_NOKEY(row_index_t, cfg->m_n_indexes); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_11", + UT_DELETE_ARRAY(cfg->m_indexes); + cfg->m_indexes = NULL; + ); + + if (cfg->m_indexes == NULL) { + return(DB_OUT_OF_MEMORY); + } + + memset(cfg->m_indexes, 0x0, sizeof(*cfg->m_indexes) * cfg->m_n_indexes); + + row_index_t* cfg_index = cfg->m_indexes; + + char name[BUFSIZ]; + + snprintf(name, sizeof(name), "index" IB_ID_FMT, m_index.m_id); + + ulint len = strlen(name) + 1; + + cfg_index->m_name = UT_NEW_ARRAY_NOKEY(byte, len); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_12", + UT_DELETE_ARRAY(cfg_index->m_name); + cfg_index->m_name = NULL; + ); + + if (cfg_index->m_name == NULL) { + return(DB_OUT_OF_MEMORY); + } + + memcpy(cfg_index->m_name, name, len); + + cfg_index->m_id = m_index.m_id; + + cfg_index->m_space = m_space; + + cfg_index->m_page_no = m_index.m_page_no; + + return(DB_SUCCESS); +} + +/* Functor that is called for each physical page that is read from the +tablespace file. + + 1. Check each page for corruption. + + 2. Update the space id and LSN on every page + * For the header page + - Validate the flags + - Update the LSN + + 3. On Btree pages + * Set the index id + * Update the max trx id + * In a cluster index, update the system columns + * In a cluster index, update the BLOB ptr, set the space id + * Purge delete marked records, but only if they can be easily + removed from the page + * Keep a counter of number of rows, ie. non-delete-marked rows + * Keep a counter of number of delete marked rows + * Keep a counter of number of purge failure + * If a page is stamped with an index id that isn't in the .cfg file + we assume it is deleted and the page can be ignored. + + 4. Set the page state to dirty so that it will be written to disk. +*/ +class PageConverter : public AbstractCallback { +public: + /** Constructor + @param cfg config of table being imported. 
+ @param space_id tablespace identifier + @param trx transaction covering the import */ + PageConverter(row_import* cfg, uint32_t space_id, trx_t* trx) + : + AbstractCallback(trx, space_id), + m_cfg(cfg), + m_index(cfg->m_indexes), + m_rec_iter(), + m_offsets_(), m_offsets(m_offsets_), + m_heap(0), + m_cluster_index(dict_table_get_first_index(cfg->m_table)) + { + rec_offs_init(m_offsets_); + } + + ~PageConverter() UNIV_NOTHROW override + { + if (m_heap != 0) { + mem_heap_free(m_heap); + } + } + + dberr_t run(const fil_iterator_t& iter, + buf_block_t* block) UNIV_NOTHROW override + { + return fil_iterate(iter, block, *this); + } + + /** Called for each block as it is read from the file. + @param block block to convert, it is not from the buffer pool. + @retval DB_SUCCESS or error code. */ + dberr_t operator()(buf_block_t* block) UNIV_NOTHROW override; + +private: + /** Update the page, set the space id, max trx id and index id. + @param block block read from file + @param page_type type of the page + @retval DB_SUCCESS or error code */ + dberr_t update_page(buf_block_t* block, uint16_t& page_type) + UNIV_NOTHROW; + + /** Update the space, index id, trx id. + @param block block to convert + @return DB_SUCCESS or error code */ + dberr_t update_index_page(buf_block_t* block) UNIV_NOTHROW; + + /** Update the BLOB refrences and write UNDO log entries for + rows that can't be purged optimistically. + @param block block to update + @retval DB_SUCCESS or error code */ + dberr_t update_records(buf_block_t* block) UNIV_NOTHROW; + + /** Validate the space flags and update tablespace header page. + @param block block read from file, not from the buffer pool. + @retval DB_SUCCESS or error code */ + dberr_t update_header(buf_block_t* block) UNIV_NOTHROW; + + /** Adjust the BLOB reference for a single column that is externally stored + @param rec record to update + @param offsets column offsets for the record + @param i column ordinal value + @return DB_SUCCESS or error code */ + dberr_t adjust_cluster_index_blob_column( + rec_t* rec, + const rec_offs* offsets, + ulint i) UNIV_NOTHROW; + + /** Adjusts the BLOB reference in the clustered index row for all + externally stored columns. + @param rec record to update + @param offsets column offsets for the record + @return DB_SUCCESS or error code */ + dberr_t adjust_cluster_index_blob_columns( + rec_t* rec, + const rec_offs* offsets) UNIV_NOTHROW; + + /** In the clustered index, adjist the BLOB pointers as needed. + Also update the BLOB reference, write the new space id. + @param rec record to update + @param offsets column offsets for the record + @return DB_SUCCESS or error code */ + dberr_t adjust_cluster_index_blob_ref( + rec_t* rec, + const rec_offs* offsets) UNIV_NOTHROW; + + /** Purge delete-marked records, only if it is possible to do + so without re-organising the B+tree. + @retval true if purged */ + bool purge() UNIV_NOTHROW; + + /** Adjust the BLOB references and sys fields for the current record. + @param rec record to update + @param offsets column offsets for the record + @return DB_SUCCESS or error code. */ + dberr_t adjust_cluster_record( + rec_t* rec, + const rec_offs* offsets) UNIV_NOTHROW; + + /** Find an index with the matching id. 
+ @return row_index_t* instance or 0 */ + row_index_t* find_index(index_id_t id) UNIV_NOTHROW + { + row_index_t* index = &m_cfg->m_indexes[0]; + + for (ulint i = 0; i < m_cfg->m_n_indexes; ++i, ++index) { + if (id == index->m_id) { + return(index); + } + } + + return(0); + + } +private: + /** Config for table that is being imported. */ + row_import* m_cfg; + + /** Current index whose pages are being imported */ + row_index_t* m_index; + + /** Iterator over records in a block */ + RecIterator m_rec_iter; + + /** Record offset */ + rec_offs m_offsets_[REC_OFFS_NORMAL_SIZE]; + + /** Pointer to m_offsets_ */ + rec_offs* m_offsets; + + /** Memory heap for the record offsets */ + mem_heap_t* m_heap; + + /** Cluster index instance */ + dict_index_t* m_cluster_index; +}; + +/** +row_import destructor. */ +row_import::~row_import() UNIV_NOTHROW +{ + for (ulint i = 0; m_indexes != 0 && i < m_n_indexes; ++i) { + UT_DELETE_ARRAY(m_indexes[i].m_name); + + if (m_indexes[i].m_fields == NULL) { + continue; + } + + dict_field_t* fields = m_indexes[i].m_fields; + ulint n_fields = m_indexes[i].m_n_fields; + + for (ulint j = 0; j < n_fields; ++j) { + UT_DELETE_ARRAY(const_cast<char*>(fields[j].name())); + } + + UT_DELETE_ARRAY(fields); + } + + for (ulint i = 0; m_col_names != 0 && i < m_n_cols; ++i) { + UT_DELETE_ARRAY(m_col_names[i]); + } + + UT_DELETE_ARRAY(m_cols); + UT_DELETE_ARRAY(m_indexes); + UT_DELETE_ARRAY(m_col_names); + UT_DELETE_ARRAY(m_table_name); + UT_DELETE_ARRAY(m_hostname); +} + +/** Find the index entry in in the indexes array. +@param name index name +@return instance if found else 0. */ +row_index_t* +row_import::get_index( + const char* name) const UNIV_NOTHROW +{ + for (ulint i = 0; i < m_n_indexes; ++i) { + const char* index_name; + row_index_t* index = &m_indexes[i]; + + index_name = reinterpret_cast<const char*>(index->m_name); + + if (strcmp(index_name, name) == 0) { + + return(index); + } + } + + return(0); +} + +/** Get the number of rows in the index. +@param name index name +@return number of rows (doesn't include delete marked rows). */ +ulint +row_import::get_n_rows( + const char* name) const UNIV_NOTHROW +{ + const row_index_t* index = get_index(name); + + ut_a(name != 0); + + return(index->m_stats.m_n_rows); +} + +/** Get the number of rows for which purge failed uding the convert phase. +@param name index name +@return number of rows for which purge failed. */ +ulint +row_import::get_n_purge_failed( + const char* name) const UNIV_NOTHROW +{ + const row_index_t* index = get_index(name); + + ut_a(name != 0); + + return(index->m_stats.m_n_purge_failed); +} + +/** Find the ordinal value of the column name in the cfg table columns. +@param name of column to look for. +@return ULINT_UNDEFINED if not found. */ +ulint +row_import::find_col( + const char* name) const UNIV_NOTHROW +{ + for (ulint i = 0; i < m_n_cols; ++i) { + const char* col_name; + + col_name = reinterpret_cast<const char*>(m_col_names[i]); + + if (strcmp(col_name, name) == 0) { + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/** +Check if the index schema that was read from the .cfg file matches the +in memory index definition. +@return DB_SUCCESS or error code. 
*/ +dberr_t +row_import::match_index_columns( + THD* thd, + const dict_index_t* index) UNIV_NOTHROW +{ + row_index_t* cfg_index; + dberr_t err = DB_SUCCESS; + + cfg_index = get_index(index->name); + + if (cfg_index == 0) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index %s not found in tablespace meta-data file.", + index->name()); + + return(DB_ERROR); + } + + if (cfg_index->m_n_fields != index->n_fields) { + + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index field count %u doesn't match" + " tablespace metadata file value " ULINTPF, + index->n_fields, cfg_index->m_n_fields); + + return(DB_ERROR); + } + + cfg_index->m_srv_index = index; + + const dict_field_t* field = index->fields; + const dict_field_t* cfg_field = cfg_index->m_fields; + + for (ulint i = 0; i < index->n_fields; ++i, ++field, ++cfg_field) { + + if (field->name() && cfg_field->name() + && strcmp(field->name(), cfg_field->name()) != 0) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index field name %s doesn't match" + " tablespace metadata field name %s" + " for field position " ULINTPF, + field->name(), cfg_field->name(), i); + + err = DB_ERROR; + } + + if (cfg_field->prefix_len != field->prefix_len) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index %s field %s prefix len %u" + " doesn't match metadata file value %u", + index->name(), field->name(), + field->prefix_len, cfg_field->prefix_len); + + err = DB_ERROR; + } + + if (cfg_field->fixed_len != field->fixed_len) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index %s field %s fixed len %u" + " doesn't match metadata file value %u", + index->name(), field->name(), + field->fixed_len, + cfg_field->fixed_len); + + err = DB_ERROR; + } + } + + return(err); +} + +/** Check if the table schema that was read from the .cfg file matches the +in memory table definition. +@param thd MySQL session variable +@return DB_SUCCESS or error code. 
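+
+Every column of the server table must appear in the .cfg meta-data at the
+same ordinal position and with identical type information (prtype, mtype,
+length, multi-byte lengths, ordering and prefix attributes); any difference
+is reported through ib_errf() and makes the import fail with a schema
+mismatch error.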
*/ +dberr_t +row_import::match_table_columns( + THD* thd) UNIV_NOTHROW +{ + dberr_t err = DB_SUCCESS; + const dict_col_t* col = m_table->cols; + + for (ulint i = 0; i < m_table->n_cols; ++i, ++col) { + + const char* col_name; + ulint cfg_col_index; + + col_name = dict_table_get_col_name( + m_table, dict_col_get_no(col)); + + cfg_col_index = find_col(col_name); + + if (cfg_col_index == ULINT_UNDEFINED) { + + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s not found in tablespace.", + col_name); + + err = DB_ERROR; + } else if (cfg_col_index != col->ind) { + + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s ordinal value mismatch, it's at %u" + " in the table and " ULINTPF + " in the tablespace meta-data file", + col_name, col->ind, cfg_col_index); + + err = DB_ERROR; + } else { + const dict_col_t* cfg_col; + + cfg_col = &m_cols[cfg_col_index]; + ut_a(cfg_col->ind == cfg_col_index); + + if (cfg_col->prtype != col->prtype) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s precise type mismatch," + " it's 0X%X in the table and 0X%X" + " in the tablespace meta file", + col_name, col->prtype, cfg_col->prtype); + err = DB_ERROR; + } + + if (cfg_col->mtype != col->mtype) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s main type mismatch," + " it's 0X%X in the table and 0X%X" + " in the tablespace meta file", + col_name, col->mtype, cfg_col->mtype); + err = DB_ERROR; + } + + if (cfg_col->len != col->len) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s length mismatch," + " it's %u in the table and %u" + " in the tablespace meta file", + col_name, col->len, cfg_col->len); + err = DB_ERROR; + } + + if (cfg_col->mbminlen != col->mbminlen + || cfg_col->mbmaxlen != col->mbmaxlen) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s multi-byte len mismatch," + " it's %u-%u in the table and %u-%u" + " in the tablespace meta file", + col_name, col->mbminlen, col->mbmaxlen, + cfg_col->mbminlen, cfg_col->mbmaxlen); + err = DB_ERROR; + } + + if (cfg_col->ind != col->ind) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s position mismatch," + " it's %u in the table and %u" + " in the tablespace meta file", + col_name, col->ind, cfg_col->ind); + err = DB_ERROR; + } + + if (cfg_col->ord_part != col->ord_part) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s ordering mismatch," + " it's %u in the table and %u" + " in the tablespace meta file", + col_name, col->ord_part, + cfg_col->ord_part); + err = DB_ERROR; + } + + if (cfg_col->max_prefix != col->max_prefix) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s max prefix mismatch" + " it's %u in the table and %u" + " in the tablespace meta file", + col_name, col->max_prefix, + cfg_col->max_prefix); + err = DB_ERROR; + } + } + } + + return(err); +} + +dberr_t row_import::match_flags(THD *thd) const +{ + ulint mismatch= (m_table->flags ^ m_flags) & ~DICT_TF_MASK_DATA_DIR; + if (!mismatch) + return DB_SUCCESS; + + const char *msg; + if (mismatch & DICT_TF_MASK_ZIP_SSIZE) + { + if ((m_table->flags & DICT_TF_MASK_ZIP_SSIZE) && + (m_flags & DICT_TF_MASK_ZIP_SSIZE)) + { + switch (m_flags & DICT_TF_MASK_ZIP_SSIZE) { + case 0U << DICT_TF_POS_ZIP_SSIZE: + goto uncompressed; + case 1U << DICT_TF_POS_ZIP_SSIZE: + msg= "ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1"; + break; + case 2U << DICT_TF_POS_ZIP_SSIZE: + msg= 
"ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=2"; + break; + case 3U << DICT_TF_POS_ZIP_SSIZE: + msg= "ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4"; + break; + case 4U << DICT_TF_POS_ZIP_SSIZE: + msg= "ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8"; + break; + case 5U << DICT_TF_POS_ZIP_SSIZE: + msg= "ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=16"; + break; + default: + msg= "strange KEY_BLOCK_SIZE"; + } + } + else if (m_flags & DICT_TF_MASK_ZIP_SSIZE) + msg= "ROW_FORMAT=COMPRESSED"; + else + goto uncompressed; + } + else + { + uncompressed: + msg= (m_flags & DICT_TF_MASK_ATOMIC_BLOBS) ? "ROW_FORMAT=DYNAMIC" + : (m_flags & DICT_TF_MASK_COMPACT) ? "ROW_FORMAT=COMPACT" + : "ROW_FORMAT=REDUNDANT"; + } + + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, + "Table flags don't match, server table has 0x%x and the meta-data " + "file has 0x%zx; .cfg file uses %s", + m_table->flags, m_flags, msg); + + return DB_ERROR; +} + +/** Check if the table (and index) schema that was read from the .cfg file +matches the in memory table definition. +@param thd MySQL session variable +@return DB_SUCCESS or error code. */ +dberr_t +row_import::match_schema( + THD* thd) UNIV_NOTHROW +{ + /* Do some simple checks. */ + + if (UT_LIST_GET_LEN(m_table->indexes) != m_n_indexes) { + + /* If the number of indexes don't match then it is better + to abort the IMPORT. It is easy for the user to create a + table matching the IMPORT definition. */ + + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, + "Number of indexes don't match, table has " ULINTPF + " indexes but the tablespace meta-data file has " + ULINTPF " indexes", + UT_LIST_GET_LEN(m_table->indexes), m_n_indexes); + + return(DB_ERROR); + } + + dberr_t err = match_table_columns(thd); + + if (err != DB_SUCCESS) { + return(err); + } + + /* Check if the index definitions match. */ + + const dict_index_t* index; + + for (index = UT_LIST_GET_FIRST(m_table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + dberr_t index_err; + + index_err = match_index_columns(thd, index); + + if (index_err != DB_SUCCESS) { + err = index_err; + } + } + + return(err); +} + +/** +Set the index root <space, pageno>, using index name. */ +void +row_import::set_root_by_name() UNIV_NOTHROW +{ + row_index_t* cfg_index = m_indexes; + + for (ulint i = 0; i < m_n_indexes; ++i, ++cfg_index) { + dict_index_t* index; + + const char* index_name; + + index_name = reinterpret_cast<const char*>(cfg_index->m_name); + + index = dict_table_get_index_on_name(m_table, index_name); + + /* We've already checked that it exists. */ + ut_a(index != 0); + + index->page = cfg_index->m_page_no; + } +} + +/** +Set the index root <space, pageno>, using a heuristic. 
+@return DB_SUCCESS or error code */ +dberr_t +row_import::set_root_by_heuristic() UNIV_NOTHROW +{ + row_index_t* cfg_index = m_indexes; + + ut_a(m_n_indexes > 0); + + // TODO: For now use brute force, based on ordinality + + if (UT_LIST_GET_LEN(m_table->indexes) != m_n_indexes) { + + ib::warn() << "Table " << m_table->name << " should have " + << UT_LIST_GET_LEN(m_table->indexes) << " indexes but" + " the tablespace has " << m_n_indexes << " indexes"; + } + + ulint i = 0; + dberr_t err = DB_SUCCESS; + + for (dict_index_t* index = UT_LIST_GET_FIRST(m_table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + if (index->type & DICT_FTS) { + index->type |= DICT_CORRUPT; + ib::warn() << "Skipping FTS index: " << index->name; + } else if (i < m_n_indexes) { + + UT_DELETE_ARRAY(cfg_index[i].m_name); + + ulint len = strlen(index->name) + 1; + + cfg_index[i].m_name = UT_NEW_ARRAY_NOKEY(byte, len); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_14", + UT_DELETE_ARRAY(cfg_index[i].m_name); + cfg_index[i].m_name = NULL; + ); + + if (cfg_index[i].m_name == NULL) { + err = DB_OUT_OF_MEMORY; + break; + } + + memcpy(cfg_index[i].m_name, index->name, len); + + cfg_index[i].m_srv_index = index; + + index->page = cfg_index[i++].m_page_no; + } + } + + return(err); +} + +/** +Purge delete marked records. +@return DB_SUCCESS or error code. */ +dberr_t +IndexPurge::garbage_collect() UNIV_NOTHROW +{ + ibool comp = dict_table_is_comp(m_index->table); + + /* Open the persistent cursor and start the mini-transaction. */ + + dberr_t err = open() ? next() : DB_CORRUPTION; + + for (; err == DB_SUCCESS; err = next()) { + + rec_t* rec = btr_pcur_get_rec(&m_pcur); + ibool deleted = rec_get_deleted_flag(rec, comp); + + if (!deleted) { + ++m_n_rows; + } else { + err = purge(); + if (err != DB_SUCCESS) { + break; + } + } + } + + /* Close the persistent cursor and commit the mini-transaction. */ + + close(); + + return(err == DB_END_OF_INDEX ? DB_SUCCESS : err); +} + +/** +Begin import, position the cursor on the first record. */ +inline bool IndexPurge::open() noexcept +{ + m_mtr.start(); + m_mtr.set_log_mode(MTR_LOG_NO_REDO); + + btr_pcur_init(&m_pcur); + + if (m_pcur.open_leaf(true, m_index, BTR_MODIFY_LEAF, &m_mtr) != DB_SUCCESS) + return false; + + rec_t *rec= page_rec_get_next(btr_pcur_get_rec(&m_pcur)); + if (!rec) + return false; + if (rec_is_metadata(rec, *m_index)) + /* Skip the metadata pseudo-record. */ + btr_pcur_get_page_cur(&m_pcur)->rec= rec; + return true; +} + +/** +Position the cursor on the next record. +@return DB_SUCCESS or error code */ +dberr_t IndexPurge::next() noexcept +{ + if (UNIV_UNLIKELY(!btr_pcur_move_to_next_on_page(&m_pcur))) { + return DB_CORRUPTION; + } + + /* When switching pages, commit the mini-transaction + in order to release the latch on the old page. */ + + if (!btr_pcur_is_after_last_on_page(&m_pcur)) { + return(DB_SUCCESS); + } else if (trx_is_interrupted(m_trx)) { + /* Check after every page because the check + is expensive. */ + return(DB_INTERRUPTED); + } + + btr_pcur_store_position(&m_pcur, &m_mtr); + + mtr_commit(&m_mtr); + + mtr_start(&m_mtr); + + mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO); + + if (m_pcur.restore_position(BTR_MODIFY_LEAF, &m_mtr) + == btr_pcur_t::CORRUPTED) { + return DB_CORRUPTION; + } + /* The following is based on btr_pcur_move_to_next_user_rec(). 
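+	After the cursor position has been restored on the new page, the
+	loop below advances until the cursor rests on a user record again:
+	if it is past the last record on a page it moves on to the next
+	page, and DB_END_OF_INDEX is returned once the end of the tree
+	has been reached.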
*/ + m_pcur.old_rec = nullptr; + ut_ad(m_pcur.latch_mode == BTR_MODIFY_LEAF); + do { + if (btr_pcur_is_after_last_on_page(&m_pcur)) { + if (btr_pcur_is_after_last_in_tree(&m_pcur)) { + return DB_END_OF_INDEX; + } + + if (dberr_t err = btr_pcur_move_to_next_page(&m_pcur, + &m_mtr)) { + return err; + } + } else if (!btr_pcur_move_to_next_on_page(&m_pcur)) { + return DB_CORRUPTION; + } + } while (!btr_pcur_is_on_user_rec(&m_pcur)); + + return DB_SUCCESS; +} + +/** +Store the persistent cursor position and reopen the +B-tree cursor in BTR_MODIFY_TREE mode, because the +tree structure may be changed during a pessimistic delete. */ +inline dberr_t IndexPurge::purge_pessimistic_delete() noexcept +{ + dberr_t err; + if (m_pcur.restore_position(BTR_PURGE_TREE, &m_mtr) != btr_pcur_t::CORRUPTED) + { + ut_ad(rec_get_deleted_flag(btr_pcur_get_rec(&m_pcur), + m_index->table->not_redundant())); + btr_cur_pessimistic_delete(&err, FALSE, btr_pcur_get_btr_cur(&m_pcur), 0, + false, &m_mtr); + } + else + err= DB_CORRUPTION; + + m_mtr.commit(); + return err; +} + +dberr_t IndexPurge::purge() noexcept +{ + btr_pcur_store_position(&m_pcur, &m_mtr); + m_mtr.commit(); + m_mtr.start(); + m_mtr.set_log_mode(MTR_LOG_NO_REDO); + dberr_t err= purge_pessimistic_delete(); + + m_mtr.start(); + m_mtr.set_log_mode(MTR_LOG_NO_REDO); + if (err == DB_SUCCESS) + err= (m_pcur.restore_position(BTR_MODIFY_LEAF, &m_mtr) == + btr_pcur_t::CORRUPTED) + ? DB_CORRUPTION : DB_SUCCESS; + return err; +} + +/** Adjust the BLOB reference for a single column that is externally stored +@param rec record to update +@param offsets column offsets for the record +@param i column ordinal value +@return DB_SUCCESS or error code */ +inline +dberr_t +PageConverter::adjust_cluster_index_blob_column( + rec_t* rec, + const rec_offs* offsets, + ulint i) UNIV_NOTHROW +{ + ulint len; + byte* field; + + field = rec_get_nth_field(rec, offsets, i, &len); + + DBUG_EXECUTE_IF("ib_import_trigger_corruption_2", + len = BTR_EXTERN_FIELD_REF_SIZE - 1;); + + if (len < BTR_EXTERN_FIELD_REF_SIZE) { + + ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_INNODB_INDEX_CORRUPT, + "Externally stored column(" ULINTPF + ") has a reference length of " ULINTPF + " in the cluster index %s", + i, len, m_cluster_index->name()); + + return(DB_CORRUPTION); + } + + field += len - (BTR_EXTERN_FIELD_REF_SIZE - BTR_EXTERN_SPACE_ID); + + mach_write_to_4(field, get_space_id()); + + if (UNIV_LIKELY_NULL(m_rec_iter.current_block()->page.zip.data)) { + page_zip_write_blob_ptr( + m_rec_iter.current_block(), rec, m_cluster_index, + offsets, i, &m_rec_iter.m_mtr); + } + + return(DB_SUCCESS); +} + +/** Adjusts the BLOB reference in the clustered index row for all externally +stored columns. +@param rec record to update +@param offsets column offsets for the record +@return DB_SUCCESS or error code */ +inline +dberr_t +PageConverter::adjust_cluster_index_blob_columns( + rec_t* rec, + const rec_offs* offsets) UNIV_NOTHROW +{ + ut_ad(rec_offs_any_extern(offsets)); + + /* Adjust the space_id in the BLOB pointers. */ + + for (ulint i = 0; i < rec_offs_n_fields(offsets); ++i) { + + /* Only if the column is stored "externally". */ + + if (rec_offs_nth_extern(offsets, i)) { + dberr_t err; + + err = adjust_cluster_index_blob_column(rec, offsets, i); + + if (err != DB_SUCCESS) { + return(err); + } + } + } + + return(DB_SUCCESS); +} + +/** In the clustered index, adjust BLOB pointers as needed. Also update the +BLOB reference, write the new space id. 
+@param rec record to update +@param offsets column offsets for the record +@return DB_SUCCESS or error code */ +inline +dberr_t +PageConverter::adjust_cluster_index_blob_ref( + rec_t* rec, + const rec_offs* offsets) UNIV_NOTHROW +{ + if (rec_offs_any_extern(offsets)) { + dberr_t err; + + err = adjust_cluster_index_blob_columns(rec, offsets); + + if (err != DB_SUCCESS) { + return(err); + } + } + + return(DB_SUCCESS); +} + +/** Purge delete-marked records, only if it is possible to do so without +re-organising the B+tree. +@return true if purge succeeded */ +inline bool PageConverter::purge() UNIV_NOTHROW +{ + /* We can't have a page that is empty and not root. */ + if (m_rec_iter.remove(m_offsets)) { + + ++m_index->m_stats.m_n_purged; + + return(true); + } else { + ++m_index->m_stats.m_n_purge_failed; + } + + return(false); +} + +/** Adjust the BLOB references and sys fields for the current record. +@param rec record to update +@param offsets column offsets for the record +@return DB_SUCCESS or error code. */ +inline +dberr_t +PageConverter::adjust_cluster_record( + rec_t* rec, + const rec_offs* offsets) UNIV_NOTHROW +{ + dberr_t err; + + if ((err = adjust_cluster_index_blob_ref(rec, offsets)) == DB_SUCCESS) { + + /* Reset DB_TRX_ID and DB_ROLL_PTR. Normally, these fields + are only written in conjunction with other changes to the + record. */ + ulint trx_id_pos = m_cluster_index->n_uniq + ? m_cluster_index->n_uniq : 1; + if (UNIV_LIKELY_NULL(m_rec_iter.current_block() + ->page.zip.data)) { + page_zip_write_trx_id_and_roll_ptr( + m_rec_iter.current_block(), + rec, m_offsets, trx_id_pos, + 0, roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS, + &m_rec_iter.m_mtr); + } else { + ulint len; + byte* ptr = rec_get_nth_field( + rec, m_offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + memcpy(ptr, reset_trx_id, sizeof reset_trx_id); + } + } + + return(err); +} + +/** Update the BLOB refrences and write UNDO log entries for +rows that can't be purged optimistically. +@param block block to update +@retval DB_SUCCESS or error code */ +inline +dberr_t +PageConverter::update_records( + buf_block_t* block) UNIV_NOTHROW +{ + ibool comp = dict_table_is_comp(m_cfg->m_table); + bool clust_index = m_index->m_srv_index == m_cluster_index; + + /* This will also position the cursor on the first user record. */ + + if (!m_rec_iter.open(block, m_index->m_srv_index)) { + return DB_CORRUPTION; + } + + while (!m_rec_iter.end()) { + rec_t* rec = m_rec_iter.current(); + ibool deleted = rec_get_deleted_flag(rec, comp); + + /* For the clustered index we have to adjust the BLOB + reference and the system fields irrespective of the + delete marked flag. The adjustment of delete marked + cluster records is required for purge to work later. */ + + if (deleted || clust_index) { + m_offsets = rec_get_offsets( + rec, m_index->m_srv_index, m_offsets, + m_index->m_srv_index->n_core_fields, + ULINT_UNDEFINED, &m_heap); + } + + if (clust_index) { + + dberr_t err = adjust_cluster_record(rec, m_offsets); + + if (err != DB_SUCCESS) { + return(err); + } + } + + /* If it is a delete marked record then try an + optimistic delete. */ + + if (deleted) { + ++m_index->m_stats.m_n_deleted; + /* A successful purge will move the cursor to the + next record. */ + + if (purge()) { + continue; + } + } else { + ++m_index->m_stats.m_n_rows; + } + + if (!m_rec_iter.next()) { + return DB_CORRUPTION; + } + } + + return(DB_SUCCESS); +} + +/** Update the space, index id, trx id. 
+@return DB_SUCCESS or error code */ +inline +dberr_t +PageConverter::update_index_page( + buf_block_t* block) UNIV_NOTHROW +{ + const page_id_t page_id(block->page.id()); + + if (is_free(page_id.page_no())) { + return(DB_SUCCESS); + } + + buf_frame_t* page = block->page.frame; + const index_id_t id = btr_page_get_index_id(page); + + if (id != m_index->m_id) { + row_index_t* index = find_index(id); + + if (UNIV_UNLIKELY(!index)) { + if (!m_cfg->m_missing) { + ib::warn() << "Unknown index id " << id + << " on page " << page_id.page_no(); + } + return DB_SUCCESS; + } + + m_index = index; + } + + /* If the .cfg file is missing and there is an index mismatch + then ignore the error. */ + if (m_cfg->m_missing && !m_index->m_srv_index) { + return(DB_SUCCESS); + } + + if (m_index && page_id.page_no() == m_index->m_page_no) { + byte *b = FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + FSEG_HDR_SPACE + + page; + mach_write_to_4(b, page_id.space()); + + memcpy(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + FSEG_HDR_SPACE + + page, b, 4); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + memcpy(&block->page.zip.data[FIL_PAGE_DATA + + PAGE_BTR_SEG_TOP + + FSEG_HDR_SPACE], b, 4); + memcpy(&block->page.zip.data[FIL_PAGE_DATA + + PAGE_BTR_SEG_LEAF + + FSEG_HDR_SPACE], b, 4); + } + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(!block->page.zip.data || page_zip_validate(&block->page.zip, page, + m_index->m_srv_index)); +#endif /* UNIV_ZIP_DEBUG */ + + /* This has to be written to uncompressed index header. Set it to + the current index id. */ + mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), + m_index->m_srv_index->id); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + memcpy(&block->page.zip.data[PAGE_HEADER + PAGE_INDEX_ID], + &block->page.frame[PAGE_HEADER + PAGE_INDEX_ID], 8); + } + + if (m_index->m_srv_index->is_clust()) { + if (page_id.page_no() != m_index->m_srv_index->page) { + goto clear_page_max_trx_id; + } + } else if (page_is_leaf(page)) { + /* Set PAGE_MAX_TRX_ID on secondary index leaf pages. */ + mach_write_to_8(&block->page.frame + [PAGE_HEADER + PAGE_MAX_TRX_ID], m_trx->id); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + memcpy_aligned<8>(&block->page.zip.data + [PAGE_HEADER + PAGE_MAX_TRX_ID], + &block->page.frame + [PAGE_HEADER + PAGE_MAX_TRX_ID], 8); + } + } else { +clear_page_max_trx_id: + /* Clear PAGE_MAX_TRX_ID so that it can be + used for other purposes in the future. IMPORT + in MySQL 5.6, 5.7 and MariaDB 10.0 and 10.1 + would set the field to the transaction ID even + on clustered index pages. */ + memset_aligned<8>(&block->page.frame + [PAGE_HEADER + PAGE_MAX_TRX_ID], + 0, 8); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + memset_aligned<8>(&block->page.zip.data + [PAGE_HEADER + PAGE_MAX_TRX_ID], + 0, 8); + } + } + + if (page_is_empty(page)) { + + /* Only a root page can be empty. */ + if (page_has_siblings(page)) { + // TODO: We should relax this and skip secondary + // indexes. Mark them as corrupt because they can + // always be rebuilt. + return(DB_CORRUPTION); + } + + return(DB_SUCCESS); + } + + return page_is_leaf(block->page.frame) + ? update_records(block) + : DB_SUCCESS; +} + +/** Validate the space flags and update tablespace header page. +@param block block read from file, not from the buffer pool. 
+@retval DB_SUCCESS or error code */ +inline dberr_t PageConverter::update_header(buf_block_t* block) UNIV_NOTHROW +{ + byte *frame= get_frame(block); + if (memcmp_aligned<2>(FIL_PAGE_SPACE_ID + frame, + FSP_HEADER_OFFSET + FSP_SPACE_ID + frame, 4)) + ib::warn() << "Space id check in the header failed: ignored"; + else if (!mach_read_from_4(FIL_PAGE_SPACE_ID + frame)) + return DB_CORRUPTION; + + memset(frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8); + + /* Write space_id to the tablespace header, page 0. */ + mach_write_to_4(FIL_PAGE_SPACE_ID + frame, get_space_id()); + memcpy_aligned<2>(FSP_HEADER_OFFSET + FSP_SPACE_ID + frame, + FIL_PAGE_SPACE_ID + frame, 4); + /* Write back the adjusted flags. */ + mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + frame, m_space_flags); + + return DB_SUCCESS; +} + +/** Update the page, set the space id, max trx id and index id. +@param block block read from file +@retval DB_SUCCESS or error code */ +inline +dberr_t +PageConverter::update_page(buf_block_t* block, uint16_t& page_type) + UNIV_NOTHROW +{ + dberr_t err = DB_SUCCESS; + + ut_ad(!block->page.zip.data == !is_compressed_table()); + + switch (page_type = fil_page_get_type(get_frame(block))) { + case FIL_PAGE_TYPE_FSP_HDR: + ut_a(block->page.id().page_no() == 0); + /* Work directly on the uncompressed page headers. */ + return(update_header(block)); + + case FIL_PAGE_INDEX: + case FIL_PAGE_RTREE: + /* We need to decompress the contents + before we can do anything. */ + + if (is_compressed_table() && !buf_zip_decompress(block, TRUE)) { + return(DB_CORRUPTION); + } + + /* fall through */ + case FIL_PAGE_TYPE_INSTANT: + /* This is on every page in the tablespace. */ + mach_write_to_4( + get_frame(block) + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, get_space_id()); + + /* Only update the Btree nodes. */ + return(update_index_page(block)); + + case FIL_PAGE_TYPE_SYS: + /* This is page 0 in the system tablespace. */ + return(DB_CORRUPTION); + + case FIL_PAGE_TYPE_XDES: + err = set_current_xdes( + block->page.id().page_no(), get_frame(block)); + /* fall through */ + case FIL_PAGE_INODE: + case FIL_PAGE_TYPE_TRX_SYS: + case FIL_PAGE_IBUF_FREE_LIST: + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_BLOB: + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + + /* Work directly on the uncompressed page headers. */ + /* This is on every page in the tablespace. */ + mach_write_to_4( + get_frame(block) + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, get_space_id()); + + return(err); + } + + ib::warn() << "Unknown page type (" << page_type << ")"; + + return(DB_CORRUPTION); +} + +/** Called for every page in the tablespace. If the page was not +updated then its state must be set to BUF_PAGE_NOT_USED. +@param block block read from file, note it is not from the buffer pool +@retval DB_SUCCESS or error code. */ +dberr_t PageConverter::operator()(buf_block_t* block) UNIV_NOTHROW +{ + /* If we already had an old page with matching number + in the buffer pool, evict it now, because + we no longer evict the pages on DISCARD TABLESPACE. 
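+	The lookup below uses BUF_PEEK_IF_IN_POOL with RW_NO_LATCH, so it
+	only affects pages that are already resident in the buffer pool;
+	pages that are not cached are not read in from disk.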
*/ + buf_page_get_low(block->page.id(), get_zip_size(), RW_NO_LATCH, + nullptr, BUF_PEEK_IF_IN_POOL, + nullptr, nullptr, false); + + uint16_t page_type; + + if (dberr_t err = update_page(block, page_type)) { + return err; + } + + const bool full_crc32 = fil_space_t::full_crc32(get_space_flags()); + byte* frame = get_frame(block); + memset_aligned<8>(frame + FIL_PAGE_LSN, 0, 8); + + if (!block->page.zip.data) { + buf_flush_init_for_writing( + NULL, block->page.frame, NULL, full_crc32); + } else if (fil_page_type_is_index(page_type)) { + buf_flush_init_for_writing( + NULL, block->page.zip.data, &block->page.zip, + full_crc32); + } else { + /* Calculate and update the checksum of non-index + pages for ROW_FORMAT=COMPRESSED tables. */ + buf_flush_update_zip_checksum( + block->page.zip.data, block->zip_size()); + } + + return DB_SUCCESS; +} + +/*****************************************************************//** +Clean up after import tablespace. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_import_cleanup( +/*===============*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */ + dberr_t err) /*!< in: error code */ +{ + if (err != DB_SUCCESS) { + dict_table_t* table = prebuilt->table; + table->file_unreadable = true; + if (table->space) { + fil_close_tablespace(table->space_id); + table->space = NULL; + } + + prebuilt->trx->error_info = NULL; + + ib::info() << "Discarding tablespace of table " + << table->name << ": " << err; + + for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index; + index = UT_LIST_GET_NEXT(indexes, index)) { + index->page = FIL_NULL; + } + } + + DBUG_EXECUTE_IF("ib_import_before_commit_crash", DBUG_SUICIDE();); + + prebuilt->trx->commit(); + + if (prebuilt->trx->dict_operation_lock_mode) { + row_mysql_unlock_data_dictionary(prebuilt->trx); + } + + prebuilt->trx->op_info = ""; + + DBUG_EXECUTE_IF("ib_import_before_checkpoint_crash", DBUG_SUICIDE();); + + return(err); +} + +/*****************************************************************//** +Report error during tablespace import. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_import_error( +/*=============*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */ + dberr_t err) /*!< in: error code */ +{ + if (!trx_is_interrupted(prebuilt->trx)) { + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), + prebuilt->table->name.m_name); + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_INNODB_IMPORT_ERROR, + table_name, (ulong) err, ut_strerr(err)); + } + + return row_import_cleanup(prebuilt, err); +} + +/*****************************************************************//** +Adjust the root page index node and leaf node segment headers, update +with the new space id. For all the table's secondary indexes. +@return error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_import_adjust_root_pages_of_secondary_indexes( +/*==============================================*/ + trx_t* trx, /*!< in: transaction used for + the import */ + dict_table_t* table, /*!< in: table the indexes + belong to */ + const row_import& cfg) /*!< Import context */ +{ + dict_index_t* index; + ulint n_rows_in_table; + dberr_t err = DB_SUCCESS; + + /* Skip the clustered index. 
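+	Only its row count is needed here: after the purge pass each
+	secondary index is expected to contain exactly as many entries as
+	the clustered index, otherwise it is flagged as corrupt.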
*/ + index = dict_table_get_first_index(table); + + n_rows_in_table = cfg.get_n_rows(index->name); + + DBUG_EXECUTE_IF("ib_import_sec_rec_count_mismatch_failure", + n_rows_in_table++;); + + /* Adjust the root pages of the secondary indexes only. */ + while ((index = dict_table_get_next_index(index)) != NULL) { + ut_a(!dict_index_is_clust(index)); + + if (!(index->type & DICT_CORRUPT) + && index->page != FIL_NULL) { + + /* Update the Btree segment headers for index node and + leaf nodes in the root page. Set the new space id. */ + + err = btr_root_adjust_on_import(index); + } else { + ib::warn() << "Skip adjustment of root pages for" + " index " << index->name << "."; + + err = DB_CORRUPTION; + } + + if (err != DB_SUCCESS) { + + if (index->type & DICT_CLUSTERED) { + break; + } + + ib_errf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, + ER_INNODB_INDEX_CORRUPT, + "Index %s not found or corrupt," + " you should recreate this index.", + index->name()); + + /* Do not bail out, so that the data + can be recovered. */ + + err = DB_SUCCESS; + index->type |= DICT_CORRUPT; + continue; + } + + /* If we failed to purge any records in the index then + do it the hard way. + + TODO: We can do this in the first pass by generating UNDO log + records for the failed rows. */ + + if (!cfg.requires_purge(index->name)) { + continue; + } + + IndexPurge purge(trx, index); + + trx->op_info = "secondary: purge delete marked records"; + + err = purge.garbage_collect(); + + trx->op_info = ""; + + if (err != DB_SUCCESS) { + break; + } else if (purge.get_n_rows() != n_rows_in_table) { + + ib_errf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, + ER_INNODB_INDEX_CORRUPT, + "Index '%s' contains " ULINTPF " entries, " + "should be " ULINTPF ", you should recreate " + "this index.", index->name(), + purge.get_n_rows(), n_rows_in_table); + + index->type |= DICT_CORRUPT; + + /* Do not bail out, so that the data + can be recovered. */ + + err = DB_SUCCESS; + } + } + + return(err); +} + +/*****************************************************************//** +Ensure that dict_sys.row_id exceeds SELECT MAX(DB_ROW_ID). */ +MY_ATTRIBUTE((nonnull)) static +void +row_import_set_sys_max_row_id( +/*==========================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from + handler */ + const dict_table_t* table) /*!< in: table to import */ +{ + const rec_t* rec; + mtr_t mtr; + btr_pcur_t pcur; + row_id_t row_id = 0; + dict_index_t* index; + + index = dict_table_get_first_index(table); + ut_ad(index->is_primary()); + ut_ad(dict_index_is_auto_gen_clust(index)); + + mtr_start(&mtr); + + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); + + if (pcur.open_leaf(false, index, BTR_SEARCH_LEAF, &mtr) + == DB_SUCCESS) { + rec = btr_pcur_move_to_prev_on_page(&pcur); + + if (!rec) { + /* The table is corrupted. */ + } else if (page_rec_is_infimum(rec)) { + /* The table is empty. */ + } else if (rec_is_metadata(rec, *index)) { + /* The clustered index contains the metadata + record only, that is, the table is empty. */ + } else { + row_id = mach_read_from_6(rec); + } + } + + mtr_commit(&mtr); + + if (row_id) { + /* Update the system row id if the imported index row id is + greater than the max system row id. */ + dict_sys.update_row_id(row_id); + } +} + +/*****************************************************************//** +Read the a string from the meta data file. +@return DB_SUCCESS or error code. 
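+
+The string is stored NUL-terminated in the file and max_len includes the
+terminating NUL byte; the read only succeeds when the NUL is found after
+exactly max_len - 1 preceding bytes, anything shorter or longer is treated
+as DB_IO_ERROR.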
*/ +static +dberr_t +row_import_cfg_read_string( +/*=======================*/ + FILE* file, /*!< in/out: File to read from */ + byte* ptr, /*!< out: string to read */ + ulint max_len) /*!< in: maximum length of the output + buffer in bytes */ +{ + DBUG_EXECUTE_IF("ib_import_string_read_error", + errno = EINVAL; return(DB_IO_ERROR);); + + ulint len = 0; + + while (!feof(file)) { + int ch = fgetc(file); + + if (ch == EOF) { + break; + } else if (ch != 0) { + if (len < max_len) { + ptr[len++] = static_cast<byte>(ch); + } else { + break; + } + /* max_len includes the NUL byte */ + } else if (len != max_len - 1) { + break; + } else { + ptr[len] = 0; + return(DB_SUCCESS); + } + } + + errno = EINVAL; + + return(DB_IO_ERROR); +} + +/*********************************************************************//** +Write the meta data (index user fields) config file. +@return DB_SUCCESS or error code. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_import_cfg_read_index_fields( +/*=============================*/ + FILE* file, /*!< in: file to write to */ + THD* thd, /*!< in/out: session */ + row_index_t* index) /*!< Index being read in */ +{ + byte row[sizeof(ib_uint32_t) * 3]; + ulint n_fields = index->m_n_fields; + + index->m_fields = UT_NEW_ARRAY_NOKEY(dict_field_t, n_fields); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_4", + UT_DELETE_ARRAY(index->m_fields); + index->m_fields = NULL; + ); + + if (index->m_fields == NULL) { + return(DB_OUT_OF_MEMORY); + } + + dict_field_t* field = index->m_fields; + + for (ulint i = 0; i < n_fields; ++i, ++field) { + byte* ptr = row; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_1", + (void) fseek(file, 0L, SEEK_END);); + + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while reading index fields."); + + return(DB_IO_ERROR); + } + + new (field) dict_field_t(); + + field->prefix_len = mach_read_from_4(ptr) & ((1U << 12) - 1); + ptr += sizeof(ib_uint32_t); + + field->fixed_len = mach_read_from_4(ptr) & ((1U << 10) - 1); + ptr += sizeof(ib_uint32_t); + + /* Include the NUL byte in the length. */ + ulint len = mach_read_from_4(ptr); + + byte* name = UT_NEW_ARRAY_NOKEY(byte, len); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_5", + UT_DELETE_ARRAY(name); + name = NULL; + ); + + if (name == NULL) { + return(DB_OUT_OF_MEMORY); + } + + field->name = reinterpret_cast<const char*>(name); + + dberr_t err = row_import_cfg_read_string(file, name, len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while parsing table name."); + + return(err); + } + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Read the index names and root page numbers of the indexes and set the values. +Row format [root_page_no, len of str, str ... ] +@return DB_SUCCESS or error code. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_import_read_index_data( +/*=======================*/ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import* cfg) /*!< in/out: meta-data read */ +{ + byte* ptr; + row_index_t* cfg_index; + byte row[sizeof(index_id_t) + sizeof(ib_uint32_t) * 9]; + + /* FIXME: What is the max value? 
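+	There is no documented upper bound for the number of indexes in
+	the .cfg format; row_import_read_indexes() rejects counts above
+	1024 before calling this function, and the assertions below merely
+	repeat that sanity check.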
*/ + ut_a(cfg->m_n_indexes > 0); + ut_a(cfg->m_n_indexes < 1024); + + cfg->m_indexes = UT_NEW_ARRAY_NOKEY(row_index_t, cfg->m_n_indexes); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_6", + UT_DELETE_ARRAY(cfg->m_indexes); + cfg->m_indexes = NULL; + ); + + if (cfg->m_indexes == NULL) { + return(DB_OUT_OF_MEMORY); + } + + memset(cfg->m_indexes, 0x0, sizeof(*cfg->m_indexes) * cfg->m_n_indexes); + + cfg_index = cfg->m_indexes; + + for (ulint i = 0; i < cfg->m_n_indexes; ++i, ++cfg_index) { + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_2", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the index data. */ + size_t n_bytes = fread(row, 1, sizeof(row), file); + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error", + (void) fseek(file, 0L, SEEK_END);); + + if (n_bytes != sizeof(row)) { + char msg[BUFSIZ]; + + snprintf(msg, sizeof(msg), + "while reading index meta-data, expected " + "to read " ULINTPF + " bytes but read only " ULINTPF " bytes", + sizeof(row), n_bytes); + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), msg); + + ib::error() << "IO Error: " << msg; + + return(DB_IO_ERROR); + } + + ptr = row; + + cfg_index->m_id = mach_read_from_8(ptr); + ptr += sizeof(index_id_t); + + cfg_index->m_space = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_page_no = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_type = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_trx_id_offset = mach_read_from_4(ptr); + if (cfg_index->m_trx_id_offset != mach_read_from_4(ptr)) { + ut_ad(0); + /* Overflow. Pretend that the clustered index + has a variable-length PRIMARY KEY. */ + cfg_index->m_trx_id_offset = 0; + } + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_user_defined_cols = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_uniq = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_nullable = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_fields = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + /* The NUL byte is included in the name length. */ + ulint len = mach_read_from_4(ptr); + + if (len > OS_FILE_MAX_PATH) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_INNODB_INDEX_CORRUPT, + "Index name length (" ULINTPF ") is too long, " + "the meta-data is corrupt", len); + + return(DB_CORRUPTION); + } + + cfg_index->m_name = UT_NEW_ARRAY_NOKEY(byte, len); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_7", + UT_DELETE_ARRAY(cfg_index->m_name); + cfg_index->m_name = NULL; + ); + + if (cfg_index->m_name == NULL) { + return(DB_OUT_OF_MEMORY); + } + + dberr_t err; + + err = row_import_cfg_read_string(file, cfg_index->m_name, len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while parsing index name."); + + return(err); + } + + err = row_import_cfg_read_index_fields(file, thd, cfg_index); + + if (err != DB_SUCCESS) { + return(err); + } + + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Set the index root page number for v1 format. +@return DB_SUCCESS or error code. 
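+The function below reads the 32-bit index count from the .cfg stream,
+validates it, and then delegates to row_import_read_index_data() to read
+the per-index meta-data records.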
*/ +static +dberr_t +row_import_read_indexes( +/*====================*/ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import* cfg) /*!< in/out: meta-data read */ +{ + byte row[sizeof(ib_uint32_t)]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_3", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the number of indexes. */ + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while reading number of indexes."); + + return(DB_IO_ERROR); + } + + cfg->m_n_indexes = mach_read_from_4(row); + + if (cfg->m_n_indexes == 0) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + "Number of indexes in meta-data file is 0"); + + return(DB_CORRUPTION); + + } else if (cfg->m_n_indexes > 1024) { + // FIXME: What is the upper limit? */ + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + "Number of indexes in meta-data file is too high: " + ULINTPF, cfg->m_n_indexes); + cfg->m_n_indexes = 0; + + return(DB_CORRUPTION); + } + + return(row_import_read_index_data(file, thd, cfg)); +} + +/*********************************************************************//** +Read the meta data (table columns) config file. Deserialise the contents of +dict_col_t structure, along with the column name. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_import_read_columns( +/*====================*/ + FILE* file, /*!< in: file to write to */ + THD* thd, /*!< in/out: session */ + row_import* cfg) /*!< in/out: meta-data read */ +{ + dict_col_t* col; + byte row[sizeof(ib_uint32_t) * 8]; + + /* FIXME: What should the upper limit be? */ + ut_a(cfg->m_n_cols > 0); + ut_a(cfg->m_n_cols < 1024); + + cfg->m_cols = UT_NEW_ARRAY_NOKEY(dict_col_t, cfg->m_n_cols); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_8", + UT_DELETE_ARRAY(cfg->m_cols); + cfg->m_cols = NULL; + ); + + if (cfg->m_cols == NULL) { + return(DB_OUT_OF_MEMORY); + } + + cfg->m_col_names = UT_NEW_ARRAY_NOKEY(byte*, cfg->m_n_cols); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_9", + UT_DELETE_ARRAY(cfg->m_col_names); + cfg->m_col_names = NULL; + ); + + if (cfg->m_col_names == NULL) { + return(DB_OUT_OF_MEMORY); + } + + memset(cfg->m_cols, 0x0, sizeof(cfg->m_cols) * cfg->m_n_cols); + memset(cfg->m_col_names, 0x0, sizeof(cfg->m_col_names) * cfg->m_n_cols); + + col = cfg->m_cols; + + for (ulint i = 0; i < cfg->m_n_cols; ++i, ++col) { + byte* ptr = row; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_4", + (void) fseek(file, 0L, SEEK_END);); + + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while reading table column meta-data."); + + return(DB_IO_ERROR); + } + + col->prtype = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->mtype = static_cast<byte>(mach_read_from_4(ptr)); + ptr += sizeof(ib_uint32_t); + + col->len = static_cast<uint16_t>(mach_read_from_4(ptr)); + ptr += sizeof(ib_uint32_t); + + uint32_t mbminmaxlen = mach_read_from_4(ptr); + col->mbmaxlen = (mbminmaxlen / 5) & 7; + col->mbminlen = (mbminmaxlen % 5) & 7; + ptr += sizeof(ib_uint32_t); + + col->ind = mach_read_from_4(ptr) & dict_index_t::MAX_N_FIELDS; + ptr += sizeof(ib_uint32_t); + + col->ord_part = mach_read_from_4(ptr) & 1; + ptr += sizeof(ib_uint32_t); + + col->max_prefix = mach_read_from_4(ptr) & ((1U << 12) - 1); + ptr += sizeof(ib_uint32_t); + + /* Read in the column name as 
[len, byte array]. The len + includes the NUL byte. */ + + ulint len = mach_read_from_4(ptr); + + /* FIXME: What is the maximum column name length? */ + if (len == 0 || len > 128) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_IO_READ_ERROR, + "Column name length " ULINTPF ", is invalid", + len); + + return(DB_CORRUPTION); + } + + cfg->m_col_names[i] = UT_NEW_ARRAY_NOKEY(byte, len); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_10", + UT_DELETE_ARRAY(cfg->m_col_names[i]); + cfg->m_col_names[i] = NULL; + ); + + if (cfg->m_col_names[i] == NULL) { + return(DB_OUT_OF_MEMORY); + } + + dberr_t err; + + err = row_import_cfg_read_string( + file, cfg->m_col_names[i], len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while parsing table column name."); + + return(err); + } + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Read the contents of the <tablespace>.cfg file. +@return DB_SUCCESS or error code. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_import_read_v1( +/*===============*/ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import* cfg) /*!< out: meta data */ +{ + byte value[sizeof(ib_uint32_t)]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_5", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the hostname where the tablespace was exported. */ + if (fread(value, 1, sizeof(value), file) != sizeof(value)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while reading meta-data export hostname length."); + + return(DB_IO_ERROR); + } + + ulint len = mach_read_from_4(value); + + /* NUL byte is part of name length. */ + cfg->m_hostname = UT_NEW_ARRAY_NOKEY(byte, len); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_1", + UT_DELETE_ARRAY(cfg->m_hostname); + cfg->m_hostname = NULL; + ); + + if (cfg->m_hostname == NULL) { + return(DB_OUT_OF_MEMORY); + } + + dberr_t err = row_import_cfg_read_string(file, cfg->m_hostname, len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while parsing export hostname."); + + return(err); + } + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_6", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the table name of tablespace that was exported. */ + if (fread(value, 1, sizeof(value), file) != sizeof(value)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while reading meta-data table name length."); + + return(DB_IO_ERROR); + } + + len = mach_read_from_4(value); + + /* NUL byte is part of name length. 
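+	As with the hostname above, the exporter wrote the length of the
+	table name including its terminating NUL byte, so a buffer of
+	exactly that size is allocated below.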
*/ + cfg->m_table_name = UT_NEW_ARRAY_NOKEY(byte, len); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_2", + UT_DELETE_ARRAY(cfg->m_table_name); + cfg->m_table_name = NULL; + ); + + if (cfg->m_table_name == NULL) { + return(DB_OUT_OF_MEMORY); + } + + err = row_import_cfg_read_string(file, cfg->m_table_name, len); + + if (err != DB_SUCCESS) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while parsing table name."); + + return(err); + } + + ib::info() << "Importing tablespace for table '" << cfg->m_table_name + << "' that was exported from host '" << cfg->m_hostname << "'"; + + byte row[sizeof(ib_uint32_t) * 3]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_7", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the autoinc value. */ + if (fread(row, 1, sizeof(ib_uint64_t), file) != sizeof(ib_uint64_t)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while reading autoinc value."); + + return(DB_IO_ERROR); + } + + cfg->m_autoinc = mach_read_from_8(row); + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_8", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the tablespace page size. */ + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while reading meta-data header."); + + return(DB_IO_ERROR); + } + + byte* ptr = row; + + const ulint logical_page_size = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + if (logical_page_size != srv_page_size) { + + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, + "Tablespace to be imported has a different" + " page size than this server. Server page size" + " is %lu, whereas tablespace page size" + " is " ULINTPF, + srv_page_size, + logical_page_size); + + return(DB_ERROR); + } + + cfg->m_flags = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg->m_zip_size = dict_tf_get_zip_size(cfg->m_flags); + cfg->m_n_cols = mach_read_from_4(ptr); + + if (!dict_tf_is_valid(cfg->m_flags)) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Invalid table flags: " ULINTPF, cfg->m_flags); + + return(DB_CORRUPTION); + } + + err = row_import_read_columns(file, thd, cfg); + + if (err == DB_SUCCESS) { + err = row_import_read_indexes(file, thd, cfg); + } + + return(err); +} + +/** +Read the contents of the <tablespace>.cfg file. +@return DB_SUCCESS or error code. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_import_read_meta_data( +/*======================*/ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import& cfg) /*!< out: contents of the .cfg file */ +{ + byte row[sizeof(ib_uint32_t)]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_9", + (void) fseek(file, 0L, SEEK_END);); + + if (fread(&row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while reading meta-data version."); + + return(DB_IO_ERROR); + } + + cfg.m_version = mach_read_from_4(row); + + /* Check the version number. 
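+	Only IB_EXPORT_CFG_VERSION_V1 is supported; any other version
+	causes the file to be ignored and DB_ERROR to be returned.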
*/ + switch (cfg.m_version) { + case IB_EXPORT_CFG_VERSION_V1: + + return(row_import_read_v1(file, thd, &cfg)); + default: + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + "Unsupported meta-data version number (" ULINTPF "), " + "file ignored", cfg.m_version); + } + + return(DB_ERROR); +} + +#define BTR_BLOB_HDR_PART_LEN 0 /*!< BLOB part len on this page */ +#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /*!< next BLOB part page no, + FIL_NULL if none */ +#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB part header, in bytes */ + +/* decrypt and decompress page if needed */ +static dberr_t decrypt_decompress(fil_space_crypt_t *space_crypt, + uint32_t space_flags, span<byte> page, + uint32_t space_id, byte *page_compress_buf) +{ + auto *data= page.data(); + + if (space_crypt && space_crypt->should_encrypt()) + { + if (!buf_page_verify_crypt_checksum(data, space_flags)) + return DB_CORRUPTION; + + if (dberr_t err= fil_space_decrypt(space_id, space_flags, space_crypt, + data, page.size(), data)) + return err; + } + + bool page_compressed= false; + + if (fil_space_t::full_crc32(space_flags) && + fil_space_t::is_compressed(space_flags)) + page_compressed= buf_page_is_compressed(data, space_flags); + else + { + switch (fil_page_get_type(data)) { + case FIL_PAGE_PAGE_COMPRESSED: + case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED: + page_compressed= true; + } + } + + if (page_compressed) + { + auto compress_length= + fil_page_decompress(page_compress_buf, data, space_flags); + ut_ad(compress_length != srv_page_size); + + if (compress_length == 0) + return DB_CORRUPTION; + } + + return DB_SUCCESS; +} + +static size_t get_buf_size() +{ + return srv_page_size + ( + provider_service_lzo->is_loaded ? LZO1X_1_15_MEM_COMPRESS : + provider_service_snappy->is_loaded ? snappy_max_compressed_length(srv_page_size) : + 0 + ); +} + +/* find, parse instant metadata, performing variaous checks, +and apply it to dict_table_t +@return DB_SUCCESS or some error */ +static dberr_t handle_instant_metadata(dict_table_t *table, + const row_import &cfg) +{ + dict_get_and_save_data_dir_path(table); + + char *filepath; + if (DICT_TF_HAS_DATA_DIR(table->flags)) + { + ut_a(table->data_dir_path); + filepath= fil_make_filepath(table->data_dir_path, table->name, IBD, true); + } + else + filepath= fil_make_filepath(nullptr, table->name, IBD, false); + + if (!filepath) + return DB_OUT_OF_MEMORY; + + SCOPE_EXIT([filepath]() { ut_free(filepath); }); + + bool success; + auto file= os_file_create_simple_no_error_handling( + innodb_data_file_key, filepath, OS_FILE_OPEN, OS_FILE_READ_WRITE, false, + &success); + if (!success) + return DB_IO_ERROR; + + if (os_file_get_size(file) < srv_page_size) + return DB_CORRUPTION; + + SCOPE_EXIT([&file]() { os_file_close(file); }); + + std::unique_ptr<byte[], decltype(&aligned_free)> first_page( + static_cast<byte *>(aligned_malloc(srv_page_size, srv_page_size)), + &aligned_free); + + if (dberr_t err= os_file_read(IORequestReadPartial, file, first_page.get(), + 0, srv_page_size, nullptr)) + return err; + + auto space_flags= fsp_header_get_flags(first_page.get()); + + if (!fil_space_t::is_valid_flags(space_flags, true)) + { + auto cflags= fsp_flags_convert_from_101(space_flags); + if (cflags == UINT32_MAX) + return invalid_space_flags(space_flags); + space_flags= static_cast<decltype(space_flags)>(cflags); + } + + if (!cfg.m_missing) + { + if (dberr_t err= cfg.match_flags(current_thd)) + return err; + } + + const unsigned zip_size= fil_space_t::zip_size(space_flags); + const unsigned physical_size= zip_size ? 
zip_size : unsigned(srv_page_size); + ut_ad(physical_size <= UNIV_PAGE_SIZE_MAX); + const uint32_t space_id= page_get_space_id(first_page.get()); + + auto *space_crypt= fil_space_read_crypt_data(zip_size, first_page.get()); + SCOPE_EXIT([&space_crypt]() { + if (space_crypt) + fil_space_destroy_crypt_data(&space_crypt); + }); + + std::unique_ptr<byte[], decltype(&aligned_free)> page( + static_cast<byte *>( + aligned_malloc(UNIV_PAGE_SIZE_MAX, UNIV_PAGE_SIZE_MAX)), + &aligned_free); + + if (dberr_t err= os_file_read( + IORequestReadPartial, file, page.get(), 3 * physical_size, + physical_size, nullptr)) + return err; + + std::unique_ptr<byte[]> page_compress_buf(new byte[get_buf_size()]); + + if (dberr_t err= decrypt_decompress(space_crypt, space_flags, + {page.get(), static_cast<size_t> + (physical_size)}, + space_id, page_compress_buf.get())) + return err; + + if (table->supports_instant()) + { + dict_index_t *index= dict_table_get_first_index(table); + + if (!page_is_comp(page.get()) != !dict_table_is_comp(table)) + { + ib_errf(current_thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, + "ROW_FORMAT mismatch"); + return DB_CORRUPTION; + } + + if (btr_cur_instant_root_init(index, page.get())) + return DB_CORRUPTION; + + ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES); + + if (fil_page_get_type(page.get()) == FIL_PAGE_INDEX) + { + ut_ad(!index->is_instant()); + return DB_SUCCESS; + } + + mem_heap_t *heap= NULL; + SCOPE_EXIT([&heap]() { + if (heap) + mem_heap_free(heap); + }); + + while (btr_page_get_level(page.get()) != 0) + { + const rec_t *rec= page_rec_get_next(page_get_infimum_rec(page.get())); + if (!rec) + return DB_CORRUPTION; + + /* Relax the assertion in rec_init_offsets(). */ + ut_ad(!index->in_instant_init); + ut_d(index->in_instant_init= true); + rec_offs *offsets= + rec_get_offsets(rec, index, nullptr, 0, ULINT_UNDEFINED, &heap); + ut_d(index->in_instant_init= false); + + uint64_t child_page_no= btr_node_ptr_get_child_page_no(rec, offsets); + + if (dberr_t err= + os_file_read(IORequestReadPartial, file, page.get(), + child_page_no * physical_size, physical_size, nullptr)) + return err; + + if (dberr_t err= decrypt_decompress(space_crypt, space_flags, + {page.get(), static_cast<size_t> + (physical_size)}, space_id, + page_compress_buf.get())) + return err; + } + + const auto *rec= page_rec_get_next_const(page_get_infimum_rec(page.get())); + const auto comp= dict_table_is_comp(index->table); + + if (!rec || page_rec_is_supremum(rec)) + { + corrupted_metadata: + ib::error() << "Table " << index->table->name + << " is missing instant ALTER metadata"; + index->table->corrupted= true; + return DB_CORRUPTION; + } + + const auto info_bits= rec_get_info_bits(rec, comp); + if (!(info_bits & REC_INFO_MIN_REC_FLAG)) + goto corrupted_metadata; + + if ((info_bits & ~REC_INFO_DELETED_FLAG) != REC_INFO_MIN_REC_FLAG || + (comp && rec_get_status(rec) != REC_STATUS_INSTANT)) + { + incompatible: + ib::error() << "Table " << index->table->name + << " contains unrecognizable instant ALTER metadata"; + index->table->corrupted= true; + return DB_CORRUPTION; + } + + if (info_bits & REC_INFO_DELETED_FLAG) + { + ulint trx_id_offset= index->trx_id_offset; + ut_ad(index->n_uniq); + + if (trx_id_offset) + { + } + else if (index->table->not_redundant()) + { + + for (uint i= index->n_uniq; i--;) + trx_id_offset+= index->fields[i].fixed_len; + } + else if (rec_get_1byte_offs_flag(rec)) + { + trx_id_offset= rec_1_get_field_end_info(rec, index->n_uniq - 1); + ut_ad(!(trx_id_offset & 
REC_1BYTE_SQL_NULL_MASK)); + trx_id_offset&= ~REC_1BYTE_SQL_NULL_MASK; + } + else + { + trx_id_offset= rec_2_get_field_end_info(rec, index->n_uniq - 1); + ut_ad(!(trx_id_offset & REC_2BYTE_SQL_NULL_MASK)); + trx_id_offset&= ~REC_2BYTE_SQL_NULL_MASK; + } + + const byte *ptr= + rec + trx_id_offset + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + if (mach_read_from_4(ptr + BTR_EXTERN_LEN)) + goto incompatible; + + uint len= mach_read_from_4(ptr + BTR_EXTERN_LEN + 4); + if (!len || mach_read_from_4(ptr + BTR_EXTERN_OFFSET) != FIL_PAGE_DATA) + goto incompatible; + + std::unique_ptr<byte[], decltype(&aligned_free)> + second_page(static_cast<byte*>(aligned_malloc(physical_size, + physical_size)), + &aligned_free); + + if (dberr_t err= + os_file_read(IORequestReadPartial, file, second_page.get(), + physical_size * + mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO), + physical_size, nullptr)) + return err; + + if (dberr_t err= decrypt_decompress(space_crypt, space_flags, + {second_page.get(), + static_cast<size_t>(physical_size)}, + space_id, page_compress_buf.get())) + return err; + + if (fil_page_get_type(second_page.get()) != FIL_PAGE_TYPE_BLOB || + mach_read_from_4( + &second_page[FIL_PAGE_DATA + BTR_BLOB_HDR_NEXT_PAGE_NO]) != + FIL_NULL || + mach_read_from_4( + &second_page[FIL_PAGE_DATA + BTR_BLOB_HDR_PART_LEN]) != len) + goto incompatible; + + /* The unused part of the BLOB page should be zero-filled. */ + for (const byte * + b= second_page.get() + (FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE) + + len, + *const end= second_page.get() + srv_page_size - BTR_EXTERN_LEN; + b < end;) + { + if (*b++) + goto incompatible; + } + + if (index->table->deserialise_columns( + &second_page[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE], len)) + goto incompatible; + } + + rec_offs *offsets= rec_get_offsets( + rec, index, nullptr, index->n_core_fields, ULINT_UNDEFINED, &heap); + if (rec_offs_any_default(offsets)) + { + inconsistent: + goto incompatible; + } + + /* In fact, because we only ever append fields to the metadata + record, it is also OK to perform READ UNCOMMITTED and + then ignore any extra fields, provided that + trx_sys.is_registered(DB_TRX_ID). */ + if (rec_offs_n_fields(offsets) > + ulint(index->n_fields) + !!index->table->instant && + !trx_sys.is_registered(current_trx(), + row_get_rec_trx_id(rec, index, offsets))) + goto inconsistent; + + for (unsigned i= index->n_core_fields; i < index->n_fields; i++) + { + dict_col_t *col= index->fields[i].col; + const unsigned o= i + !!index->table->instant; + ulint len; + const byte *data= rec_get_nth_field(rec, offsets, o, &len); + ut_ad(!col->is_added()); + ut_ad(!col->def_val.data); + col->def_val.len= len; + switch (len) { + case UNIV_SQL_NULL: + continue; + case 0: + col->def_val.data= field_ref_zero; + continue; + } + ut_ad(len != UNIV_SQL_DEFAULT); + if (!rec_offs_nth_extern(offsets, o)) + col->def_val.data= mem_heap_dup(index->table->heap, data, len); + else if (len < BTR_EXTERN_FIELD_REF_SIZE || + !memcmp(data + len - BTR_EXTERN_FIELD_REF_SIZE, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)) + { + col->def_val.len= UNIV_SQL_DEFAULT; + goto inconsistent; + } + else + { + col->def_val.data= btr_copy_externally_stored_field( + &col->def_val.len, data, srv_page_size, len, index->table->heap); + } + } + } + + return DB_SUCCESS; +} + +/** +Read the contents of the <tablename>.cfg file. +@return DB_SUCCESS or error code. 
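+If the .cfg file cannot be opened the import is not aborted: cfg.m_missing
+is set, a warning is reported, and DB_FAIL is returned so that the caller
+can fall back to discovering the index root pages from the .ibd file.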
*/ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_import_read_cfg( +/*================*/ + dict_table_t* table, /*!< in: table */ + THD* thd, /*!< in: session */ + row_import& cfg) /*!< out: contents of the .cfg file */ +{ + dberr_t err; + char name[OS_FILE_MAX_PATH]; + + cfg.m_table = table; + + srv_get_meta_data_filename(table, name, sizeof(name)); + + FILE* file = fopen(name, "rb"); + + if (file == NULL) { + char msg[BUFSIZ]; + + snprintf(msg, sizeof(msg), + "Error opening '%s', will attempt to import" + " without schema verification", name); + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), msg); + + cfg.m_missing = true; + + err = DB_FAIL; + } else { + + cfg.m_missing = false; + + err = row_import_read_meta_data(file, thd, cfg); + fclose(file); + } + + return(err); +} + +/** Update the root page numbers and tablespace ID of a table. +@param[in,out] trx dictionary transaction +@param[in,out] table persistent table +@param[in] reset whether to reset the fields to FIL_NULL +@return DB_SUCCESS or error code */ +dberr_t +row_import_update_index_root(trx_t* trx, dict_table_t* table, bool reset) +{ + const dict_index_t* index; + que_t* graph = 0; + dberr_t err = DB_SUCCESS; + + ut_ad(reset || table->space->id == table->space_id); + + static const char sql[] = { + "PROCEDURE UPDATE_INDEX_ROOT() IS\n" + "BEGIN\n" + "UPDATE SYS_INDEXES\n" + "SET SPACE = :space,\n" + " PAGE_NO = :page,\n" + " TYPE = :type\n" + "WHERE TABLE_ID = :table_id AND ID = :index_id;\n" + "END;\n"}; + + table->def_trx_id = trx->id; + + for (index = dict_table_get_first_index(table); + index != 0; + index = dict_table_get_next_index(index)) { + + pars_info_t* info; + ib_uint32_t page; + ib_uint32_t space; + ib_uint32_t type; + index_id_t index_id; + table_id_t table_id; + + info = (graph != 0) ? graph->info : pars_info_create(); + + mach_write_to_4( + reinterpret_cast<byte*>(&type), + index->type); + + mach_write_to_4( + reinterpret_cast<byte*>(&page), + reset ? FIL_NULL : index->page); + + mach_write_to_4( + reinterpret_cast<byte*>(&space), + reset ? FIL_NULL : index->table->space_id); + + mach_write_to_8( + reinterpret_cast<byte*>(&index_id), + index->id); + + mach_write_to_8( + reinterpret_cast<byte*>(&table_id), + table->id); + + /* If we set the corrupt bit during the IMPORT phase then + we need to update the system tables. */ + pars_info_bind_int4_literal(info, "type", &type); + pars_info_bind_int4_literal(info, "space", &space); + pars_info_bind_int4_literal(info, "page", &page); + pars_info_bind_ull_literal(info, "index_id", &index_id); + pars_info_bind_ull_literal(info, "table_id", &table_id); + + if (graph == 0) { + graph = pars_sql(info, sql); + ut_a(graph); + graph->trx = trx; + } + + que_thr_t* thr; + + ut_a(thr = que_fork_start_command(graph)); + + que_run_threads(thr); + + DBUG_EXECUTE_IF("ib_import_internal_error", + trx->error_state = DB_ERROR;); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_INTERNAL_ERROR, + "While updating the <space, root page" + " number> of index %s - %s", + index->name(), ut_strerr(err)); + + break; + } + } + + que_graph_free(graph); + + return(err); +} + +/** Callback arg for row_import_set_discarded. 
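+The fetch callback reads the current SYS_TABLES.MIX_LEN value, sets or
+clears DICT_TF2_DISCARDED in it according to the requested state, and
+leaves the updated value in flags2 for the UPDATE statement that follows.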
*/ +struct discard_t { + ib_uint32_t flags2; /*!< Value read from column */ + bool state; /*!< New state of the flag */ + ulint n_recs; /*!< Number of recs processed */ +}; + +/******************************************************************//** +Fetch callback that sets or unsets the DISCARDED tablespace flag in +SYS_TABLES. The flags is stored in MIX_LEN column. +@return FALSE if all OK */ +static +ibool +row_import_set_discarded( +/*=====================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: bool set/unset flag */ +{ + sel_node_t* node = static_cast<sel_node_t*>(row); + discard_t* discard = static_cast<discard_t*>(user_arg); + dfield_t* dfield = que_node_get_val(node->select_list); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == sizeof(ib_uint32_t)); + + ulint flags2 = mach_read_from_4( + static_cast<byte*>(dfield_get_data(dfield))); + + if (discard->state) { + flags2 |= DICT_TF2_DISCARDED; + } else { + flags2 &= ~DICT_TF2_DISCARDED; + } + + mach_write_to_4(reinterpret_cast<byte*>(&discard->flags2), flags2); + + ++discard->n_recs; + + /* There should be at most one matching record. */ + ut_a(discard->n_recs == 1); + + return(FALSE); +} + +/** Update the DICT_TF2_DISCARDED flag in SYS_TABLES.MIX_LEN. +@param[in,out] trx dictionary transaction +@param[in] table_id table identifier +@param[in] discarded whether to set or clear the flag +@return DB_SUCCESS or error code */ +dberr_t row_import_update_discarded_flag(trx_t* trx, table_id_t table_id, + bool discarded) +{ + pars_info_t* info; + discard_t discard; + + static const char sql[] = + "PROCEDURE UPDATE_DISCARDED_FLAG() IS\n" + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS\n" + " SELECT MIX_LEN" + " FROM SYS_TABLES" + " WHERE ID = :table_id FOR UPDATE;" + "\n" + "BEGIN\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "UPDATE SYS_TABLES" + " SET MIX_LEN = :flags2" + " WHERE ID = :table_id;\n" + "CLOSE c;\n" + "END;\n"; + + discard.n_recs = 0; + discard.state = discarded; + discard.flags2 = ULINT32_UNDEFINED; + + info = pars_info_create(); + + pars_info_add_ull_literal(info, "table_id", table_id); + pars_info_bind_int4_literal(info, "flags2", &discard.flags2); + + pars_info_bind_function( + info, "my_func", row_import_set_discarded, &discard); + + dberr_t err = que_eval_sql(info, sql, trx); + + ut_a(discard.n_recs == 1); + ut_a(discard.flags2 != ULINT32_UNDEFINED); + + return(err); +} + +/** InnoDB writes page by page when there is page compressed +tablespace involved. 
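+Only the compressed payload of each page is written back, so the holes
+punched over the unused page tails are preserved.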
It does help to save the disk space when +punch hole is enabled +@param iter Tablespace iterator +@param full_crc32 whether the file is in the full_crc32 format +@param offset offset of the file to be written +@param writeptr buffer to be written +@param n_bytes number of bytes to be written +@param try_punch_only Try the range punch only because the + current range is full of empty pages +@return DB_SUCCESS */ +static +dberr_t fil_import_compress_fwrite(const fil_iterator_t &iter, + bool full_crc32, + os_offset_t offset, + const byte *writeptr, + ulint n_bytes, + bool try_punch_only= false) +{ + if (dberr_t err= os_file_punch_hole(iter.file, offset, n_bytes)) + return err; + + if (try_punch_only) + return DB_SUCCESS; + + for (ulint j= 0; j < n_bytes; j+= srv_page_size) + { + /* Read the original data length from block and + safer to read FIL_PAGE_COMPRESSED_SIZE because it + is not encrypted*/ + ulint n_write_bytes= srv_page_size; + if (j || offset) + { + n_write_bytes= mach_read_from_2(writeptr + j + FIL_PAGE_DATA); + const unsigned ptype= mach_read_from_2(writeptr + j + FIL_PAGE_TYPE); + /* Ignore the empty page */ + if (ptype == 0 && n_write_bytes == 0) + continue; + if (full_crc32) + n_write_bytes= buf_page_full_crc32_size(writeptr + j, + nullptr, nullptr); + else + { + n_write_bytes+= ptype == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED + ? FIL_PAGE_DATA + FIL_PAGE_ENCRYPT_COMP_METADATA_LEN + : FIL_PAGE_DATA + FIL_PAGE_COMP_METADATA_LEN; + } + } + + if (dberr_t err= os_file_write(IORequestWrite, iter.filepath, iter.file, + writeptr + j, offset + j, n_write_bytes)) + return err; + } + + return DB_SUCCESS; +} + +dberr_t FetchIndexRootPages::run(const fil_iterator_t& iter, + buf_block_t* block) UNIV_NOTHROW +{ + const unsigned zip_size= fil_space_t::zip_size(m_space_flags); + const unsigned size= zip_size ? zip_size : unsigned(srv_page_size); + byte* page_compress_buf= static_cast<byte*>(malloc(get_buf_size())); + const bool full_crc32 = fil_space_t::full_crc32(m_space_flags); + bool skip_checksum_check = false; + ut_ad(!srv_read_only_mode); + + if (!page_compress_buf) + return DB_OUT_OF_MEMORY; + + const bool encrypted= iter.crypt_data != NULL && + iter.crypt_data->should_encrypt(); + byte* const readptr= iter.io_buffer; + block->page.frame= readptr; + + if (block->page.zip.data) + block->page.zip.data= readptr; + + bool page_compressed= false; + + dberr_t err= os_file_read(IORequestReadPartial, iter.file, readptr, + 3 * size, size, nullptr); + if (err != DB_SUCCESS) + { + ib::error() << iter.filepath << ": os_file_read() failed"; + goto func_exit; + } + + if (page_get_page_no(readptr) != 3) + { +page_corrupted: + ib::warn() << filename() << ": Page 3 at offset " + << 3 * size << " looks corrupted."; + err= DB_CORRUPTION; + goto func_exit; + } + + block->page.id_.set_page_no(3); + if (full_crc32 && fil_space_t::is_compressed(m_space_flags)) + page_compressed= buf_page_is_compressed(readptr, m_space_flags); + else + { + switch (fil_page_get_type(readptr)) { + case FIL_PAGE_PAGE_COMPRESSED: + case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED: + if (block->page.zip.data) + goto page_corrupted; + page_compressed= true; + } + } + + if (encrypted) + { + if (!buf_page_verify_crypt_checksum(readptr, m_space_flags)) + goto page_corrupted; + + if ((err= fil_space_decrypt(get_space_id(), m_space_flags, iter.crypt_data, + readptr, size, readptr))) + goto func_exit; + } + + /* For full_crc32 format, skip checksum check + after decryption. 
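+ In the full_crc32 format the checksum covers the page as it is stored
+ on disk, i.e. the encrypted image, and that checksum was already
+ verified by buf_page_verify_crypt_checksum() before decryption above.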
*/ + skip_checksum_check= full_crc32 && encrypted; + + if (page_compressed) + { + ulint compress_length= fil_page_decompress(page_compress_buf, + readptr, + m_space_flags); + ut_ad(compress_length != srv_page_size); + if (compress_length == 0) + goto page_corrupted; + } + else if (!skip_checksum_check + && buf_page_is_corrupted(false, readptr, m_space_flags)) + goto page_corrupted; + + err= this->operator()(block); +func_exit: + free(page_compress_buf); + return err; +} + +static dberr_t fil_iterate( + const fil_iterator_t& iter, + buf_block_t* block, + AbstractCallback& callback) +{ + os_offset_t offset; + const ulint size = callback.physical_size(); + ulint n_bytes = iter.n_io_buffers * size; + + byte* page_compress_buf= static_cast<byte*>(malloc(get_buf_size())); + ut_ad(!srv_read_only_mode); + + if (!page_compress_buf) { + return DB_OUT_OF_MEMORY; + } + + uint32_t actual_space_id = 0; + const bool full_crc32 = fil_space_t::full_crc32( + callback.get_space_flags()); + + /* TODO: For ROW_FORMAT=COMPRESSED tables we do a lot of useless + copying for non-index pages. Unfortunately, it is + required by buf_zip_decompress() */ + dberr_t err = DB_SUCCESS; + bool page_compressed = false; + bool punch_hole = !my_test_if_thinly_provisioned(iter.file); + + for (offset = iter.start; offset < iter.end; offset += n_bytes) { + if (callback.is_interrupted()) { + err = DB_INTERRUPTED; + goto func_exit; + } + + byte* io_buffer = iter.io_buffer; + block->page.frame = io_buffer; + + if (block->page.zip.data) { + /* Zip IO is done in the compressed page buffer. */ + io_buffer = block->page.zip.data; + } + + /* We have to read the exact number of bytes. Otherwise the + InnoDB IO functions croak on failed reads. */ + + n_bytes = ulint(ut_min(os_offset_t(n_bytes), + iter.end - offset)); + + ut_ad(n_bytes > 0); + ut_ad(!(n_bytes % size)); + + const bool encrypted = iter.crypt_data != NULL + && iter.crypt_data->should_encrypt(); + /* Use additional crypt io buffer if tablespace is encrypted */ + byte* const readptr = encrypted + ? iter.crypt_io_buffer : io_buffer; + byte* const writeptr = readptr; + + err = os_file_read(IORequestReadPartial, iter.file, readptr, + offset, n_bytes, nullptr); + if (err != DB_SUCCESS) { + ib::error() << iter.filepath + << ": os_file_read() failed"; + goto func_exit; + } + + bool updated = false; + os_offset_t page_off = offset; + ulint n_pages_read = n_bytes / size; + /* This block is not attached to buf_pool */ + block->page.id_.set_page_no(uint32_t(page_off / size)); + + for (ulint i = 0; i < n_pages_read; + ++block->page.id_, + ++i, page_off += size, block->page.frame += size) { + byte* src = readptr + i * size; + const ulint page_no = page_get_page_no(src); + if (!page_no && block->page.id().page_no()) { + if (!buf_is_zeroes(span<const byte>(src, + size))) { + goto page_corrupted; + } + /* Proceed to the next page, + because this one is all zero. 
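+ An all-zero page is simply skipped; there is nothing in it to convert,
+ and the buf_is_zeroes() check above already verified that the page
+ really is empty.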
*/ + continue; + } + + if (page_no != block->page.id().page_no()) { +page_corrupted: + ib::warn() << callback.filename() + << ": Page " << (offset / size) + << " at offset " << offset + << " looks corrupted."; + err = DB_CORRUPTION; + goto func_exit; + } + + if (block->page.id().page_no() == 0) { + actual_space_id = mach_read_from_4( + src + FIL_PAGE_SPACE_ID); + } + + const uint16_t type = fil_page_get_type(src); + page_compressed = + (full_crc32 + && fil_space_t::is_compressed( + callback.get_space_flags()) + && buf_page_is_compressed( + src, callback.get_space_flags())) + || type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED + || type == FIL_PAGE_PAGE_COMPRESSED; + + if (page_compressed && block->page.zip.data) { + goto page_corrupted; + } + + bool decrypted = false; + byte* dst = io_buffer + i * size; + bool frame_changed = false; + uint key_version = buf_page_get_key_version( + src, callback.get_space_flags()); + + if (!encrypted) { + } else if (!key_version) { + if (block->page.id().page_no() == 0 + && block->page.zip.data) { + block->page.zip.data = src; + frame_changed = true; + } else if (!page_compressed + && type != FIL_PAGE_TYPE_XDES + && !block->page.zip.data) { + block->page.frame = src; + frame_changed = true; + } else { + ut_ad(dst != src); + memcpy(dst, src, size); + } + } else { + if (!buf_page_verify_crypt_checksum( + src, callback.get_space_flags())) { + goto page_corrupted; + } + + if ((err = fil_space_decrypt( + actual_space_id, + callback.get_space_flags(), + iter.crypt_data, dst, + callback.physical_size(), + src))) { + goto func_exit; + } + + decrypted = true; + updated = true; + } + + /* For full_crc32 format, skip checksum check + after decryption. */ + bool skip_checksum_check = full_crc32 && encrypted; + + /* If the original page is page_compressed, we need + to decompress it before adjusting further. */ + if (page_compressed) { + ulint compress_length = fil_page_decompress( + page_compress_buf, dst, + callback.get_space_flags()); + ut_ad(compress_length != srv_page_size); + if (compress_length == 0) { + goto page_corrupted; + } + updated = true; + } else if (!skip_checksum_check + && buf_page_is_corrupted( + false, + encrypted && !frame_changed + ? dst : src, + callback.get_space_flags())) { + goto page_corrupted; + } + + if ((err = callback(block)) != DB_SUCCESS) { + goto func_exit; + } else if (!updated) { + updated = !!block->page.frame; + } + + /* If tablespace is encrypted we use additional + temporary scratch area where pages are read + for decrypting readptr == crypt_io_buffer != io_buffer. + + Destination for decryption is a buffer pool block + block->page.frame == dst == io_buffer that is updated. + Pages that did not require decryption even when + tablespace is marked as encrypted are not copied + instead block->page.frame is set to src == readptr. + + For encryption we again use temporary scratch area + writeptr != io_buffer == dst + that is then written to the tablespace + + (1) For normal tables io_buffer == dst == writeptr + (2) For only page compressed tables + io_buffer == dst == writeptr + (3) For encrypted (and page compressed) + readptr != io_buffer == dst != writeptr + */ + + ut_ad(!encrypted && !page_compressed ? + src == dst && dst == writeptr + (i * size):1); + ut_ad(page_compressed && !encrypted ? + src == dst && dst == writeptr + (i * size):1); + ut_ad(encrypted ? + src != dst && dst != writeptr + (i * size):1); + + /* When tablespace is encrypted or compressed its + first page (i.e. 
page 0) is not encrypted or + compressed and there is no need to copy frame. */ + if (encrypted && block->page.id().page_no() != 0) { + byte *local_frame = callback.get_frame(block); + ut_ad((writeptr + (i * size)) != local_frame); + memcpy((writeptr + (i * size)), local_frame, size); + } + + if (frame_changed) { + if (block->page.zip.data) { + block->page.zip.data = dst; + } else { + block->page.frame = dst; + } + } + + src = io_buffer + (i * size); + + if (page_compressed) { + updated = true; + if (ulint len = fil_page_compress( + src, + page_compress_buf, + callback.get_space_flags(), + 512,/* FIXME: proper block size */ + encrypted)) { + /* FIXME: remove memcpy() */ + memcpy(src, page_compress_buf, len); + memset(src + len, 0, + srv_page_size - len); + } + } + + /* Encrypt the page if encryption was used. */ + if (encrypted && decrypted) { + byte *dest = writeptr + i * size; + + byte* tmp = fil_encrypt_buf( + iter.crypt_data, + block->page.id().space(), + block->page.id().page_no(), + src, block->zip_size(), dest, + full_crc32); + + if (tmp == src) { + /* TODO: remove unnecessary memcpy's */ + ut_ad(dest != src); + memcpy(dest, src, size); + } + + updated = true; + } + + /* Write checksum for the compressed full crc32 page.*/ + if (full_crc32 && page_compressed) { + ut_ad(updated); + byte* dest = writeptr + i * size; + ut_d(bool comp = false); + ut_d(bool corrupt = false); + ulint size = buf_page_full_crc32_size( + dest, +#ifdef UNIV_DEBUG + &comp, &corrupt +#else + NULL, NULL +#endif + ); + ut_ad(!comp == (size == srv_page_size)); + ut_ad(!corrupt); + mach_write_to_4(dest + (size - 4), + my_crc32c(0, dest, size - 4)); + } + } + + if (page_compressed && punch_hole) { + err = fil_import_compress_fwrite( + iter, full_crc32, offset, writeptr, n_bytes, + !updated); + + if (err != DB_SUCCESS) { + punch_hole = false; + if (updated) { + goto normal_write; + } + } + } else if (updated) { +normal_write: + /* A page was updated in the set, write it back. */ + err = os_file_write(IORequestWrite, + iter.filepath, iter.file, + writeptr, offset, n_bytes); + + if (err != DB_SUCCESS) { + goto func_exit; + } + } + } + +func_exit: + free(page_compress_buf); + return err; +} + +/********************************************************************//** +Iterate over all the pages in the tablespace. +@param table - the table definiton in the server +@param n_io_buffers - number of blocks to read and write together +@param callback - functor that will do the page updates +@return DB_SUCCESS or error code */ +static +dberr_t +fil_tablespace_iterate( +/*===================*/ + dict_table_t* table, + ulint n_io_buffers, + AbstractCallback& callback) +{ + dberr_t err; + pfs_os_file_t file; + char* filepath; + + ut_a(n_io_buffers > 0); + ut_ad(!srv_read_only_mode); + + DBUG_EXECUTE_IF("ib_import_trigger_corruption_1", + return(DB_CORRUPTION);); + + /* Make sure the data_dir_path is set. */ + dict_get_and_save_data_dir_path(table); + + ut_ad(!DICT_TF_HAS_DATA_DIR(table->flags) || table->data_dir_path); + + const char *data_dir_path = DICT_TF_HAS_DATA_DIR(table->flags) + ? 
table->data_dir_path : nullptr; + + filepath = fil_make_filepath(data_dir_path, + {table->name.m_name, + strlen(table->name.m_name)}, + IBD, data_dir_path != nullptr); + if (!filepath) { + return(DB_OUT_OF_MEMORY); + } else { + bool success; + + file = os_file_create_simple_no_error_handling( + innodb_data_file_key, filepath, + OS_FILE_OPEN, OS_FILE_READ_WRITE, false, &success); + + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(true); + ib::error() << "Trying to import a tablespace," + " but could not open the tablespace file " + << filepath; + ut_free(filepath); + return DB_TABLESPACE_NOT_FOUND; + } else { + err = DB_SUCCESS; + } + } + + callback.set_file(filepath, file); + + os_offset_t file_size = os_file_get_size(file); + ut_a(file_size != (os_offset_t) -1); + + /* Allocate a page to read in the tablespace header, so that we + can determine the page size and zip_size (if it is compressed). + We allocate an extra page in case it is a compressed table. */ + + byte* page = static_cast<byte*>(aligned_malloc(2 * srv_page_size, + srv_page_size)); + + buf_block_t* block = reinterpret_cast<buf_block_t*> + (ut_zalloc_nokey(sizeof *block)); + block->page.frame = page; + block->page.init(buf_page_t::UNFIXED + 1, page_id_t{~0ULL}); + + /* Read the first page and determine the page size. */ + + err = os_file_read(IORequestReadPartial, file, page, 0, srv_page_size, + nullptr); + + if (err == DB_SUCCESS) { + err = callback.init(file_size, block); + } + + if (err == DB_SUCCESS) { + block->page.id_ = page_id_t(callback.get_space_id(), 0); + if (ulint zip_size = callback.get_zip_size()) { + page_zip_set_size(&block->page.zip, zip_size); + /* ROW_FORMAT=COMPRESSED is not optimised for block IO + for now. We do the IMPORT page by page. */ + n_io_buffers = 1; + } + + fil_iterator_t iter; + + /* read (optional) crypt data */ + iter.crypt_data = fil_space_read_crypt_data( + callback.get_zip_size(), page); + + /* If tablespace is encrypted, it needs extra buffers */ + if (iter.crypt_data && n_io_buffers > 1) { + /* decrease io buffers so that memory + consumption will not double */ + n_io_buffers /= 2; + } + + iter.file = file; + iter.start = 0; + iter.end = file_size; + iter.filepath = filepath; + iter.file_size = file_size; + iter.n_io_buffers = n_io_buffers; + + /* Add an extra page for compressed page scratch area. */ + iter.io_buffer = static_cast<byte*>( + aligned_malloc((1 + iter.n_io_buffers) + << srv_page_size_shift, srv_page_size)); + + iter.crypt_io_buffer = iter.crypt_data + ? static_cast<byte*>( + aligned_malloc((1 + iter.n_io_buffers) + << srv_page_size_shift, + srv_page_size)) + : NULL; + + if (block->page.zip.ssize) { + ut_ad(iter.n_io_buffers == 1); + block->page.frame = iter.io_buffer; + block->page.zip.data = block->page.frame + + srv_page_size; + } + + err = callback.run(iter, block); + + if (iter.crypt_data) { + fil_space_destroy_crypt_data(&iter.crypt_data); + } + + aligned_free(iter.crypt_io_buffer); + aligned_free(iter.io_buffer); + } + + if (err == DB_SUCCESS) { + ib::info() << "Sync to disk"; + + if (!os_file_flush(file)) { + ib::info() << "os_file_flush() failed!"; + err = DB_IO_ERROR; + } else { + ib::info() << "Sync to disk - done!"; + } + } + + os_file_close(file); + + aligned_free(page); + ut_free(filepath); + ut_free(block); + + return(err); +} + +/*****************************************************************//** +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. 
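+
+A typical SQL sequence that reaches this function is (illustrative):
+
+	ALTER TABLE t DISCARD TABLESPACE;
+	-- copy t.ibd (and, if available, t.cfg) into the data directory
+	ALTER TABLE t IMPORT TABLESPACE;
+
+The import reads and verifies the .cfg meta-data (or reconstructs it from
+the .ibd file when the .cfg is missing), converts every page of the
+tablespace, adjusts the index root pages, purges any left-over
+delete-marked records, flushes the changes to disk and finally updates
+SYS_INDEXES and the DISCARDED flag in the data dictionary.
+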
+@return error code or DB_SUCCESS */ +dberr_t +row_import_for_mysql( +/*=================*/ + dict_table_t* table, /*!< in/out: table */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */ +{ + dberr_t err; + ib_uint64_t autoinc = 0; + char* filepath = NULL; + trx_t* trx = prebuilt->trx; + + /* The caller assured that this is not read_only_mode and that no + temorary tablespace is being imported. */ + ut_ad(!srv_read_only_mode); + ut_ad(!table->is_temporary()); + + ut_ad(table->space_id); + ut_ad(table->space_id < SRV_SPACE_ID_UPPER_BOUND); + ut_ad(trx); + ut_ad(trx->state == TRX_STATE_ACTIVE); + ut_ad(!table->is_readable()); + + ibuf_delete_for_discarded_space(table->space_id); + + /* Assign an undo segment for the transaction, so that the + transaction will be recovered after a crash. */ + + /* TODO: Do not write any undo log for the IMPORT cleanup. */ + { + mtr_t mtr; + mtr.start(); + trx_undo_assign(trx, &err, &mtr); + mtr.commit(); + } + + DBUG_EXECUTE_IF("ib_import_undo_assign_failure", + err = DB_TOO_MANY_CONCURRENT_TRXS;); + + if (err == DB_SUCCESS && !trx->has_logged_persistent()) { + err = DB_TOO_MANY_CONCURRENT_TRXS; + } + if (err != DB_SUCCESS) { + return row_import_cleanup(prebuilt, err); + } + + trx->op_info = "read meta-data file"; + + row_import cfg; + THD* thd = trx->mysql_thd; + + err = row_import_read_cfg(table, thd, cfg); + + /* Check if the table column definitions match the contents + of the config file. */ + + if (err == DB_SUCCESS) { + + if (dberr_t err = handle_instant_metadata(table, cfg)) { + return row_import_error(prebuilt, err); + } + + /* We have a schema file, try and match it with our + data dictionary. */ + + err = cfg.match_schema(thd); + + /* Update index->page and SYS_INDEXES.PAGE_NO to match the + B-tree root page numbers in the tablespace. Use the index + name from the .cfg file to find match. */ + + if (err == DB_SUCCESS) { + cfg.set_root_by_name(); + autoinc = cfg.m_autoinc; + } + + DBUG_EXECUTE_IF("ib_import_set_index_root_failure", + err = DB_TOO_MANY_CONCURRENT_TRXS;); + + } else if (cfg.m_missing) { + /* We don't have a schema file, we will have to discover + the index root pages from the .ibd file and skip the schema + matching step. */ + + ut_a(err == DB_FAIL); + + cfg.m_zip_size = 0; + + if (UT_LIST_GET_LEN(table->indexes) > 1) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_INTERNAL_ERROR, + "Drop all secondary indexes before importing " + "table %s when .cfg file is missing.", + table->name.m_name); + err = DB_ERROR; + return row_import_error(prebuilt, err); + } + + FetchIndexRootPages fetchIndexRootPages(table, trx); + + err = fil_tablespace_iterate( + table, IO_BUFFER_SIZE(srv_page_size), + fetchIndexRootPages); + + if (err == DB_SUCCESS) { + + err = fetchIndexRootPages.build_row_import(&cfg); + + /* Update index->page and SYS_INDEXES.PAGE_NO + to match the B-tree root page numbers in the + tablespace. */ + + if (err == DB_SUCCESS) { + err = cfg.set_root_by_heuristic(); + + if (err == DB_SUCCESS) { + err = handle_instant_metadata(table, + cfg); + } + } + } + } + + if (err != DB_SUCCESS) { + return row_import_error(prebuilt, err); + } + + trx->op_info = "importing tablespace"; + + ib::info() << "Phase I - Update all pages"; + + /* Iterate over all the pages and do the sanity checking and + the conversion required to import the tablespace. */ + + PageConverter converter(&cfg, table->space_id, trx); + + /* Set the IO buffer size in pages. */ + + err = fil_tablespace_iterate( + table, IO_BUFFER_SIZE(cfg.m_zip_size ? 
cfg.m_zip_size + : srv_page_size), converter); + + DBUG_EXECUTE_IF("ib_import_reset_space_and_lsn_failure", + err = DB_TOO_MANY_CONCURRENT_TRXS;); +#ifdef BTR_CUR_HASH_ADAPT + /* On DISCARD TABLESPACE, we did not drop any adaptive hash + index entries. If we replaced the discarded tablespace with a + smaller one here, there could still be some adaptive hash + index entries that point to cached garbage pages in the buffer + pool, because PageConverter::operator() only evicted those + pages that were replaced by the imported pages. We must + detach any remaining adaptive hash index entries, because the + adaptive hash index must be a subset of the table contents; + false positives are not tolerated. */ + for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); index; + index = UT_LIST_GET_NEXT(indexes, index)) { + index = index->clone_if_needed(); + } +#endif /* BTR_CUR_HASH_ADAPT */ + + if (err != DB_SUCCESS) { + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), + table->name.m_name); + + if (err != DB_DECRYPTION_FAILED) { + + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_INTERNAL_ERROR, + "Error importing tablespace for table %s : %s", + table_name, ut_strerr(err)); + } + + return row_import_cleanup(prebuilt, err); + } + + /* If the table is stored in a remote tablespace, we need to + determine that filepath from the link file and system tables. + Find the space ID in SYS_TABLES since this is an ALTER TABLE. */ + dict_get_and_save_data_dir_path(table); + + ut_ad(!DICT_TF_HAS_DATA_DIR(table->flags) || table->data_dir_path); + const char *data_dir_path = DICT_TF_HAS_DATA_DIR(table->flags) + ? table->data_dir_path : nullptr; + fil_space_t::name_type name{ + table->name.m_name, strlen(table->name.m_name)}; + + filepath = fil_make_filepath(data_dir_path, name, IBD, + data_dir_path != nullptr); + + DBUG_EXECUTE_IF( + "ib_import_OOM_15", + ut_free(filepath); + filepath = NULL; + ); + + if (filepath == NULL) { + return row_import_cleanup(prebuilt, DB_OUT_OF_MEMORY); + } + + /* Open the tablespace so that we can access via the buffer pool. + The tablespace is initially opened as a temporary one, because + we will not be writing any redo log for it before we have invoked + fil_space_t::set_imported() to declare it a persistent tablespace. */ + + table->space = fil_ibd_open( + 2, FIL_TYPE_IMPORT, table->space_id, + dict_tf_to_fsp_flags(table->flags), name, filepath, &err); + + ut_ad((table->space == NULL) == (err != DB_SUCCESS)); + DBUG_EXECUTE_IF("ib_import_open_tablespace_failure", + err = DB_TABLESPACE_NOT_FOUND; table->space = NULL;); + + if (!table->space) { + ib_senderrf(thd, IB_LOG_LEVEL_ERROR, + ER_GET_ERRMSG, + err, ut_strerr(err), filepath); + } + + ut_free(filepath); + + if (err == DB_SUCCESS) { + err = ibuf_check_bitmap_on_import(trx, table->space); + } + + DBUG_EXECUTE_IF("ib_import_check_bitmap_failure", err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return row_import_cleanup(prebuilt, err); + } + + /* The first index must always be the clustered index. */ + + dict_index_t* index = dict_table_get_first_index(table); + + if (!dict_index_is_clust(index)) { + return row_import_error(prebuilt, DB_CORRUPTION); + } + + /* Update the Btree segment headers for index node and + leaf nodes in the root page. Set the new space id. 
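+	This adjusts the clustered index only; the secondary index roots
+	are handled later by
+	row_import_adjust_root_pages_of_secondary_indexes().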
*/ + + err = btr_root_adjust_on_import(index); + + DBUG_EXECUTE_IF("ib_import_cluster_root_adjust_failure", + err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return row_import_error(prebuilt, err); + } else if (cfg.requires_purge(index->name)) { + + /* Purge any delete-marked records that couldn't be + purged during the page conversion phase from the + cluster index. */ + + IndexPurge purge(trx, index); + + trx->op_info = "cluster: purging delete marked records"; + + err = purge.garbage_collect(); + + trx->op_info = ""; + } + + DBUG_EXECUTE_IF("ib_import_cluster_failure", err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return row_import_error(prebuilt, err); + } + + /* For secondary indexes, purge any records that couldn't be purged + during the page conversion phase. */ + + err = row_import_adjust_root_pages_of_secondary_indexes( + trx, table, cfg); + + DBUG_EXECUTE_IF("ib_import_sec_root_adjust_failure", + err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return row_import_error(prebuilt, err); + } + + /* Ensure that the next available DB_ROW_ID is not smaller than + any DB_ROW_ID stored in the table. */ + + if (prebuilt->clust_index_was_generated) { + row_import_set_sys_max_row_id(prebuilt, table); + } + + ib::info() << "Phase III - Flush changes to disk"; + + /* Ensure that all pages dirtied during the IMPORT make it to disk. + The only dirty pages generated should be from the pessimistic purge + of delete marked records that couldn't be purged in Phase I. */ + while (buf_flush_list_space(prebuilt->table->space)); + + for (ulint count = 0; prebuilt->table->space->referenced(); count++) { + /* Issue a warning every 10.24 seconds, starting after + 2.56 seconds */ + if ((count & 511) == 128) { + ib::warn() << "Waiting for flush to complete on " + << prebuilt->table->name; + } + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + + ib::info() << "Phase IV - Flush complete"; + prebuilt->table->space->set_imported(); + + /* The dictionary latches will be released in in row_import_cleanup() + after the transaction commit, for both success and error. */ + + row_mysql_lock_data_dictionary(trx); + + /* Update the root pages of the table's indexes. */ + err = row_import_update_index_root(trx, table, false); + + if (err != DB_SUCCESS) { + return row_import_error(prebuilt, err); + } + + err = row_import_update_discarded_flag(trx, table->id, false); + + if (err != DB_SUCCESS) { + return row_import_error(prebuilt, err); + } + + table->file_unreadable = false; + table->flags2 &= ~DICT_TF2_DISCARDED & ((1U << DICT_TF2_BITS) - 1); + + /* Set autoinc value read from .cfg file, if one was specified. + Otherwise, keep the PAGE_ROOT_AUTO_INC as is. */ + if (autoinc) { + ib::info() << table->name << " autoinc value set to " + << autoinc; + + table->autoinc = autoinc--; + btr_write_autoinc(dict_table_get_first_index(table), autoinc); + } + + return row_import_cleanup(prebuilt, err); +} diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc new file mode 100644 index 00000000..bdee0ed1 --- /dev/null +++ b/storage/innobase/row/row0ins.cc @@ -0,0 +1,3843 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0ins.cc +Insert into a table + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#include "row0ins.h" +#include "dict0dict.h" +#include "trx0rec.h" +#include "trx0undo.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "mach0data.h" +#include "ibuf0ibuf.h" +#include "que0que.h" +#include "row0upd.h" +#include "row0sel.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "log0log.h" +#include "eval0eval.h" +#include "data0data.h" +#include "buf0lru.h" +#include "fts0fts.h" +#include "fts0types.h" +#ifdef BTR_CUR_HASH_ADAPT +# include "btr0sea.h" +#endif +#ifdef WITH_WSREP +#include <wsrep.h> +#include <mysql/service_wsrep.h> +#include "ha_prototypes.h" +#endif /* WITH_WSREP */ + +/************************************************************************* +IMPORTANT NOTE: Any operation that generates redo MUST check that there +is enough space in the redo log before for that operation. This is +done by calling log_free_check(). The reason for checking the +availability of the redo log space before the start of the operation is +that we MUST not hold any synchonization objects when performing the +check. +If you make a change in this module make sure that no codepath is +introduced where a call to log_free_check() is bypassed. */ + +/** Create an row template for each index of a table. */ +static void ins_node_create_entry_list(ins_node_t *node) +{ + node->entry_list.reserve(UT_LIST_GET_LEN(node->table->indexes)); + + for (dict_index_t *index= dict_table_get_first_index(node->table); index; + index= dict_table_get_next_index(index)) + { + /* Corrupted or incomplete secondary indexes will be filtered out in + row_ins(). */ + dtuple_t *entry= index->online_status >= ONLINE_INDEX_ABORTED + ? dtuple_create(node->entry_sys_heap, 0) + : row_build_index_entry_low(node->row, NULL, index, node->entry_sys_heap, + ROW_BUILD_FOR_INSERT); + node->entry_list.push_back(entry); + } +} + +/*****************************************************************//** +Adds system field buffers to a row. */ +static +void +row_ins_alloc_sys_fields( +/*=====================*/ + ins_node_t* node) /*!< in: insert node */ +{ + dtuple_t* row; + dict_table_t* table; + const dict_col_t* col; + dfield_t* dfield; + + row = node->row; + table = node->table; + + ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table)); + + /* allocate buffer to hold the needed system created hidden columns. */ + compile_time_assert(DATA_ROW_ID_LEN + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + == sizeof node->sys_buf); + memset(node->sys_buf, 0, sizeof node->sys_buf); + /* Assign DB_ROLL_PTR to 1 << ROLL_PTR_INSERT_FLAG_POS */ + node->sys_buf[DATA_ROW_ID_LEN + DATA_TRX_ID_LEN] = 0x80; + ut_ad(!memcmp(node->sys_buf + DATA_ROW_ID_LEN, reset_trx_id, + sizeof reset_trx_id)); + + /* 1. 
Populate row-id */ + col = dict_table_get_sys_col(table, DATA_ROW_ID); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + + dfield_set_data(dfield, node->sys_buf, DATA_ROW_ID_LEN); + + /* 2. Populate trx id */ + col = dict_table_get_sys_col(table, DATA_TRX_ID); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + + dfield_set_data(dfield, &node->sys_buf[DATA_ROW_ID_LEN], + DATA_TRX_ID_LEN); + + col = dict_table_get_sys_col(table, DATA_ROLL_PTR); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + + dfield_set_data(dfield, &node->sys_buf[DATA_ROW_ID_LEN + + DATA_TRX_ID_LEN], + DATA_ROLL_PTR_LEN); +} + +/*********************************************************************//** +Sets a new row to insert for an INS_DIRECT node. This function is only used +if we have constructed the row separately, which is a rare case; this +function is quite slow. */ +void +ins_node_set_new_row( +/*=================*/ + ins_node_t* node, /*!< in: insert node */ + dtuple_t* row) /*!< in: new row (or first row) for the node */ +{ + node->state = INS_NODE_SET_IX_LOCK; + node->index = NULL; + node->entry_list.clear(); + node->entry = node->entry_list.end(); + + node->row = row; + + mem_heap_empty(node->entry_sys_heap); + + /* Create templates for index entries */ + + ins_node_create_entry_list(node); + + /* Allocate from entry_sys_heap buffers for sys fields */ + + row_ins_alloc_sys_fields(node); + + /* As we allocated a new trx id buf, the trx id should be written + there again: */ + + node->trx_id = 0; +} + +/*******************************************************************//** +Does an insert operation by updating a delete-marked existing record +in the index. This situation can occur if the delete-marked record is +kept in the index for consistent reads. +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins_sec_index_entry_by_modify( +/*==============================*/ + ulint flags, /*!< in: undo logging and locking flags */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_INSERT_TREE, + depending on whether mtr holds just a leaf + latch or also a tree latch */ + btr_cur_t* cursor, /*!< in: B-tree cursor */ + rec_offs** offsets,/*!< in/out: offsets on cursor->page_cur.rec */ + mem_heap_t* offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + const dtuple_t* entry, /*!< in: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr; must be committed before + latching any further pages */ +{ + big_rec_t* dummy_big_rec; + upd_t* update; + rec_t* rec; + dberr_t err; + + rec = btr_cur_get_rec(cursor); + + ut_ad(!cursor->index()->is_clust()); + ut_ad(rec_offs_validate(rec, cursor->index(), *offsets)); + ut_ad(!entry->info_bits); + + /* We know that in the alphabetical ordering, entry and rec are + identified. But in their binary form there may be differences if + there are char fields in them. Therefore we have to calculate the + difference. */ + + update = row_upd_build_sec_rec_difference_binary( + rec, cursor->index(), *offsets, entry, heap); + + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) { + /* We should never insert in place of a record that + has not been delete-marked. The only exception is when + online CREATE INDEX copied the changes that we already + made to the clustered index, and completed the + secondary index creation before we got here. In this + case, the change would already be there. 
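// --------------------------------------------------------------------------
// [Editorial sketch -- not part of this commit] row_ins_alloc_sys_fields()
// above points DB_ROW_ID, DB_TRX_ID and DB_ROLL_PTR at one shared zeroed
// buffer and sets the first DB_ROLL_PTR byte to 0x80, i.e. the roll pointer
// becomes 1 << 55 with the "insert" flag set. A standalone picture of that
// layout, assuming the usual 6/6/7-byte system column lengths:
#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
	enum { ROW_ID_LEN = 6, TRX_ID_LEN = 6, ROLL_PTR_LEN = 7 };
	uint8_t sys_buf[ROW_ID_LEN + TRX_ID_LEN + ROLL_PTR_LEN];

	std::memset(sys_buf, 0, sizeof sys_buf);
	sys_buf[ROW_ID_LEN + TRX_ID_LEN] = 0x80;  // insert flag of DB_ROLL_PTR

	// bytes [0,6)   -> DB_ROW_ID   (0 until assigned)
	// bytes [6,12)  -> DB_TRX_ID   (0 = "reset" transaction id)
	// bytes [12,19) -> DB_ROLL_PTR (only the top bit, the insert flag, set)
	assert(sizeof sys_buf == 19 && sys_buf[12] == 0x80);
	return 0;
}
// --------------------------------------------------------------------------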
The CREATE + INDEX should be in wait_while_table_is_used() at least + until this INSERT or UPDATE returns. After that point, + set_committed(true) would be invoked in + commit_inplace_alter_table(). */ + ut_a(update->n_fields == 0); + ut_ad(!dict_index_is_online_ddl(cursor->index())); + return cursor->index()->is_committed() + ? DB_CORRUPTION : DB_SUCCESS; + } + + if (mode == BTR_MODIFY_LEAF) { + /* Try an optimistic updating of the record, keeping changes + within the page */ + + /* TODO: pass only *offsets */ + err = btr_cur_optimistic_update( + flags | BTR_KEEP_SYS_FLAG, cursor, + offsets, &offsets_heap, update, 0, thr, + thr_get_trx(thr)->id, mtr); + switch (err) { + case DB_OVERFLOW: + case DB_UNDERFLOW: + case DB_ZIP_OVERFLOW: + err = DB_FAIL; + default: + break; + } + } else { + ut_ad(mode == BTR_INSERT_TREE); + if (buf_pool.running_out()) { + + return(DB_LOCK_TABLE_FULL); + } + + err = btr_cur_pessimistic_update( + flags | BTR_KEEP_SYS_FLAG, cursor, + offsets, &offsets_heap, + heap, &dummy_big_rec, update, 0, + thr, thr_get_trx(thr)->id, mtr); + ut_ad(!dummy_big_rec); + } + + return(err); +} + +/*******************************************************************//** +Does an insert operation by delete unmarking and updating a delete marked +existing record in the index. This situation can occur if the delete marked +record is kept in the index for consistent reads. +@return DB_SUCCESS, DB_FAIL, or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins_clust_index_entry_by_modify( +/*================================*/ + btr_pcur_t* pcur, /*!< in/out: a persistent cursor pointing + to the clust_rec that is being modified. */ + ulint flags, /*!< in: undo logging and locking flags */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether mtr holds just a leaf + latch or also a tree latch */ + rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** offsets_heap, + /*!< in/out: pointer to memory heap that can + be emptied, or NULL */ + mem_heap_t* heap, /*!< in/out: memory heap */ + const dtuple_t* entry, /*!< in: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr; must be committed before + latching any further pages */ +{ + const rec_t* rec; + upd_t* update; + dberr_t err = DB_SUCCESS; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + TABLE* mysql_table = NULL; + ut_ad(cursor->index()->is_clust()); + + rec = btr_cur_get_rec(cursor); + + ut_ad(rec_get_deleted_flag(rec, + cursor->index()->table->not_redundant())); + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. 
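// --------------------------------------------------------------------------
// [Editorial sketch -- not part of this commit] The function above first
// tries btr_cur_optimistic_update() under a leaf-page latch and translates
// "does not fit in the page" results (DB_OVERFLOW, DB_UNDERFLOW,
// DB_ZIP_OVERFLOW) into DB_FAIL, so the caller can retry the whole operation
// with a tree latch and the pessimistic variant. A generic, hypothetical
// rendering of that calling pattern:
#include <functional>

enum class UpdStatus { OK, RETRY_WITH_TREE_LATCH, ERROR };

static UpdStatus update_with_fallback(
	const std::function<UpdStatus()>& optimistic,
	const std::function<UpdStatus()>& pessimistic)
{
	UpdStatus s = optimistic();               // cheap, in-page attempt
	if (s == UpdStatus::RETRY_WITH_TREE_LATCH) {
		s = pessimistic();                // may split/merge B-tree pages
	}
	return s;
}
// --------------------------------------------------------------------------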
*/ + ut_ad(rec_get_trx_id(rec, cursor->index())); + + /* Build an update vector containing all the fields to be modified; + NOTE that this vector may NOT contain system columns trx_id or + roll_ptr */ + if (thr->prebuilt != NULL) { + mysql_table = thr->prebuilt->m_mysql_table; + ut_ad(thr->prebuilt->trx == thr_get_trx(thr)); + } + + update = row_upd_build_difference_binary( + cursor->index(), entry, rec, NULL, true, true, + thr_get_trx(thr), heap, mysql_table, &err); + if (err != DB_SUCCESS) { + return(err); + } + + if (mode != BTR_MODIFY_TREE) { + ut_ad(mode == BTR_MODIFY_LEAF + || mode == BTR_MODIFY_LEAF_ALREADY_LATCHED + || mode == BTR_MODIFY_ROOT_AND_LEAF + || mode == BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED); + + /* Try optimistic updating of the record, keeping changes + within the page */ + + err = btr_cur_optimistic_update( + flags, cursor, offsets, offsets_heap, update, 0, thr, + thr_get_trx(thr)->id, mtr); + switch (err) { + case DB_OVERFLOW: + case DB_UNDERFLOW: + case DB_ZIP_OVERFLOW: + err = DB_FAIL; + default: + break; + } + } else { + if (buf_pool.running_out()) { + return DB_LOCK_TABLE_FULL; + } + + big_rec_t* big_rec = NULL; + + err = btr_cur_pessimistic_update( + flags | BTR_KEEP_POS_FLAG, + cursor, offsets, offsets_heap, heap, + &big_rec, update, 0, thr, thr_get_trx(thr)->id, mtr); + + if (big_rec) { + ut_a(err == DB_SUCCESS); + + DEBUG_SYNC_C("before_row_ins_upd_extern"); + err = btr_store_big_rec_extern_fields( + pcur, *offsets, big_rec, mtr, + BTR_STORE_INSERT_UPDATE); + DEBUG_SYNC_C("after_row_ins_upd_extern"); + dtuple_big_rec_free(big_rec); + } + } + + return(err); +} + +/*********************************************************************//** +Returns TRUE if in a cascaded update/delete an ancestor node of node +updates (not DELETE, but UPDATE) table. +@return TRUE if an ancestor updates table */ +static +ibool +row_ins_cascade_ancestor_updates_table( +/*===================================*/ + que_node_t* node, /*!< in: node in a query graph */ + dict_table_t* table) /*!< in: table */ +{ + que_node_t* parent; + + for (parent = que_node_get_parent(node); + que_node_get_type(parent) == QUE_NODE_UPDATE; + parent = que_node_get_parent(parent)) { + + upd_node_t* upd_node; + + upd_node = static_cast<upd_node_t*>(parent); + + if (upd_node->table == table && !upd_node->is_delete) { + + return(TRUE); + } + } + + return(FALSE); +} + +/*********************************************************************//** +Returns the number of ancestor UPDATE or DELETE nodes of a +cascaded update/delete node. +@return number of ancestors */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +ulint +row_ins_cascade_n_ancestors( +/*========================*/ + que_node_t* node) /*!< in: node in a query graph */ +{ + que_node_t* parent; + ulint n_ancestors = 0; + + for (parent = que_node_get_parent(node); + que_node_get_type(parent) == QUE_NODE_UPDATE; + parent = que_node_get_parent(parent)) { + + n_ancestors++; + } + + return(n_ancestors); +} + +/******************************************************************//** +Calculates the update vector node->cascade->update for a child table in +a cascaded update. 
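// --------------------------------------------------------------------------
// [Editorial sketch -- not part of this commit] The two helpers above walk
// the query graph upwards through que_node_get_parent() and inspect the
// enclosing UPDATE nodes; the depth they compute is later compared against
// FK_MAX_CASCADE_DEL to refuse overly deep cascades. A minimal model of that
// walk (Node is a hypothetical stand-in for the query-graph node type):
struct Node { const Node* parent; bool is_update_node; };

static unsigned count_update_ancestors(const Node* n)
{
	unsigned depth = 0;
	for (const Node* p = n->parent; p && p->is_update_node; p = p->parent)
		depth++;                 // one level per cascaded update/delete
	return depth;
}
// --------------------------------------------------------------------------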
+@return whether any FULLTEXT INDEX is affected */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +row_ins_cascade_calc_update_vec( +/*============================*/ + upd_node_t* node, /*!< in: update node of the parent + table */ + dict_foreign_t* foreign, /*!< in: foreign key constraint whose + type is != 0 */ + mem_heap_t* heap, /*!< in: memory heap to use as + temporary storage */ + trx_t* trx) /*!< in: update transaction */ +{ + upd_node_t* cascade = node->cascade_node; + dict_table_t* table = foreign->foreign_table; + dict_index_t* index = foreign->foreign_index; + upd_t* update; + dict_table_t* parent_table; + dict_index_t* parent_index; + upd_t* parent_update; + ulint n_fields_updated; + ulint parent_field_no; + ulint i; + ulint j; + bool doc_id_updated = false; + unsigned doc_id_pos = 0; + doc_id_t new_doc_id = FTS_NULL_DOC_ID; + ulint prefix_col; + + ut_a(cascade); + ut_a(table); + ut_a(index); + + /* Calculate the appropriate update vector which will set the fields + in the child index record to the same value (possibly padded with + spaces if the column is a fixed length CHAR or FIXBINARY column) as + the referenced index record will get in the update. */ + + parent_table = node->table; + ut_a(parent_table == foreign->referenced_table); + parent_index = foreign->referenced_index; + parent_update = node->update; + + update = cascade->update; + + update->info_bits = 0; + + n_fields_updated = 0; + + bool affects_fulltext = foreign->affects_fulltext(); + + if (table->fts) { + doc_id_pos = dict_table_get_nth_col_pos( + table, table->fts->doc_col, &prefix_col); + } + + for (i = 0; i < foreign->n_fields; i++) { + + parent_field_no = dict_table_get_nth_col_pos( + parent_table, + dict_index_get_nth_col_no(parent_index, i), + &prefix_col); + + for (j = 0; j < parent_update->n_fields; j++) { + const upd_field_t* parent_ufield + = &parent_update->fields[j]; + + if (parent_ufield->field_no == parent_field_no) { + + ulint min_size; + const dict_col_t* col; + ulint ufield_len; + upd_field_t* ufield; + + col = dict_index_get_nth_col(index, i); + + /* A field in the parent index record is + updated. Let us make the update vector + field for the child table. */ + + ufield = update->fields + n_fields_updated; + + ufield->field_no = static_cast<uint16_t>( + dict_table_get_nth_col_pos( + table, dict_col_get_no(col), + &prefix_col)); + + ufield->orig_len = 0; + ufield->exp = NULL; + + ufield->new_val = parent_ufield->new_val; + dfield_get_type(&ufield->new_val)->prtype |= + col->prtype & DATA_VERSIONED; + ufield_len = dfield_get_len(&ufield->new_val); + + /* Clear the "external storage" flag */ + dfield_set_len(&ufield->new_val, ufield_len); + + /* Do not allow a NOT NULL column to be + updated as NULL */ + + if (dfield_is_null(&ufield->new_val) + && (col->prtype & DATA_NOT_NULL)) { + goto err_exit; + } + + /* If the new value would not fit in the + column, do not allow the update */ + + if (!dfield_is_null(&ufield->new_val) + && dtype_get_at_most_n_mbchars( + col->prtype, + col->mbminlen, col->mbmaxlen, + col->len, + ufield_len, + static_cast<char*>( + dfield_get_data( + &ufield->new_val))) + < ufield_len) { + goto err_exit; + } + + /* If the parent column type has a different + length than the child column type, we may + need to pad with spaces the new value of the + child column */ + + min_size = dict_col_get_min_size(col); + + /* Because UNIV_SQL_NULL (the marker + of SQL NULL values) exceeds all possible + values of min_size, the test below will + not hold for SQL NULL columns. 
*/ + + if (min_size > ufield_len) { + + byte* pad; + ulint pad_len; + byte* padded_data; + ulint mbminlen; + + padded_data = static_cast<byte*>( + mem_heap_alloc( + heap, min_size)); + + pad = padded_data + ufield_len; + pad_len = min_size - ufield_len; + + memcpy(padded_data, + dfield_get_data(&ufield + ->new_val), + ufield_len); + + mbminlen = dict_col_get_mbminlen(col); + + ut_ad(!(ufield_len % mbminlen)); + ut_ad(!(min_size % mbminlen)); + + if (mbminlen == 1 + && dtype_get_charset_coll( + col->prtype) + == DATA_MYSQL_BINARY_CHARSET_COLL) { + /* Do not pad BINARY columns */ + goto err_exit; + } + + row_mysql_pad_col(mbminlen, + pad, pad_len); + dfield_set_data(&ufield->new_val, + padded_data, min_size); + } + + /* If Doc ID is updated, check whether the + Doc ID is valid */ + if (table->fts + && ufield->field_no == doc_id_pos) { + doc_id_t n_doc_id; + + n_doc_id = + table->fts->cache->next_doc_id; + + new_doc_id = fts_read_doc_id( + static_cast<const byte*>( + dfield_get_data( + &ufield->new_val))); + + affects_fulltext = true; + doc_id_updated = true; + + if (new_doc_id <= 0) { + ib::error() << "FTS Doc ID" + " must be larger than" + " 0"; + goto err_exit; + } + + if (new_doc_id < n_doc_id) { + ib::error() << "FTS Doc ID" + " must be larger than " + << n_doc_id - 1 + << " for table " + << table->name; + goto err_exit; + } + } + + n_fields_updated++; + } + } + } + + if (affects_fulltext) { + ut_ad(table->fts); + + if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { + doc_id_t doc_id; + doc_id_t* next_doc_id; + upd_field_t* ufield; + + next_doc_id = static_cast<doc_id_t*>(mem_heap_alloc( + heap, sizeof(doc_id_t))); + + ut_ad(!doc_id_updated); + ufield = update->fields + n_fields_updated; + fts_get_next_doc_id(table, next_doc_id); + doc_id = fts_update_doc_id(table, ufield, next_doc_id); + n_fields_updated++; + fts_trx_add_op(trx, table, doc_id, FTS_INSERT, NULL); + } else { + if (doc_id_updated) { + ut_ad(new_doc_id); + fts_trx_add_op(trx, table, new_doc_id, + FTS_INSERT, NULL); + } else { + ib::error() << "FTS Doc ID must be updated" + " along with FTS indexed column for" + " table " << table->name; +err_exit: + n_fields_updated = ULINT_UNDEFINED; + } + } + } + + update->n_fields = n_fields_updated; + + return affects_fulltext; +} + +/*********************************************************************//** +Set detailed error message associated with foreign key errors for +the given transaction. */ +static +void +row_ins_set_detailed( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + dict_foreign_t* foreign) /*!< in: foreign key constraint */ +{ + ut_ad(!srv_read_only_mode); + + mysql_mutex_lock(&srv_misc_tmpfile_mutex); + rewind(srv_misc_tmpfile); + + if (os_file_set_eof(srv_misc_tmpfile)) { + ut_print_name(srv_misc_tmpfile, trx, + foreign->foreign_table_name); + std::string fk_str = dict_print_info_on_foreign_key_in_create_format( + trx, foreign, FALSE); + fputs(fk_str.c_str(), srv_misc_tmpfile); + trx_set_detailed_error_from_file(trx, srv_misc_tmpfile); + } else { + trx_set_detailed_error(trx, "temp file operation failed"); + } + + mysql_mutex_unlock(&srv_misc_tmpfile_mutex); +} + +/*********************************************************************//** +Acquires dict_foreign_err_mutex, rewinds dict_foreign_err_file +and displays information about the given transaction. +The caller must release dict_foreign_err_mutex. 
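// --------------------------------------------------------------------------
// [Editorial sketch -- not part of this commit] When the cascaded value is
// shorter than the child's fixed-length column (min_size > ufield_len), the
// code above copies the value into a heap buffer and space-pads the tail via
// row_mysql_pad_col(); BINARY columns are never padded, the cascade is
// rejected instead. A standalone sketch of the single-byte-charset case
// (mbminlen == 1), with hypothetical names:
#include <cstdint>
#include <cstring>
#include <vector>

static std::vector<uint8_t> pad_char_value(const uint8_t* data,
					   std::size_t len,
					   std::size_t child_min_size)
{
	// Caller guarantees len <= child_min_size for a fixed-length CHAR.
	std::vector<uint8_t> padded(child_min_size, 0x20 /* space */);
	std::memcpy(padded.data(), data, len);
	return padded;
}
// --------------------------------------------------------------------------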
*/ +TRANSACTIONAL_TARGET +static +void +row_ins_foreign_trx_print( +/*======================*/ + trx_t* trx) /*!< in: transaction */ +{ + ulint n_rec_locks; + ulint n_trx_locks; + ulint heap_size; + + ut_ad(!srv_read_only_mode); + + { + TMLockMutexGuard g{SRW_LOCK_CALL}; + n_rec_locks = trx->lock.n_rec_locks; + n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks); + heap_size = mem_heap_get_size(trx->lock.lock_heap); + } + + mysql_mutex_lock(&dict_foreign_err_mutex); + rewind(dict_foreign_err_file); + ut_print_timestamp(dict_foreign_err_file); + fputs(" Transaction:\n", dict_foreign_err_file); + + trx_print_low(dict_foreign_err_file, trx, 600, + n_rec_locks, n_trx_locks, heap_size); + + mysql_mutex_assert_owner(&dict_foreign_err_mutex); +} + +/*********************************************************************//** +Reports a foreign key error associated with an update or a delete of a +parent table index entry. */ +static +void +row_ins_foreign_report_err( +/*=======================*/ + const char* errstr, /*!< in: error string from the viewpoint + of the parent table */ + que_thr_t* thr, /*!< in: query thread whose run_node + is an update node */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + const rec_t* rec, /*!< in: a matching index record in the + child table */ + const dtuple_t* entry) /*!< in: index entry in the parent + table */ +{ + std::string fk_str; + + if (srv_read_only_mode) { + return; + } + + FILE* ef = dict_foreign_err_file; + trx_t* trx = thr_get_trx(thr); + + row_ins_set_detailed(trx, foreign); + + row_ins_foreign_trx_print(trx); + + fputs("Foreign key constraint fails for table ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + fputs(":\n", ef); + fk_str = dict_print_info_on_foreign_key_in_create_format(trx, foreign, + TRUE); + fputs(fk_str.c_str(), ef); + putc('\n', ef); + fputs(errstr, ef); + fprintf(ef, " in parent table, in index %s", + foreign->referenced_index->name()); + if (entry) { + fputs(" tuple:\n", ef); + dtuple_print(ef, entry); + } + fputs("\nBut in child table ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + fprintf(ef, ", in index %s", foreign->foreign_index->name()); + if (rec) { + fputs(", there is a record:\n", ef); + rec_print(ef, rec, foreign->foreign_index); + } else { + fputs(", the record is not available\n", ef); + } + putc('\n', ef); + + mysql_mutex_unlock(&dict_foreign_err_mutex); +} + +/*********************************************************************//** +Reports a foreign key error to dict_foreign_err_file when we are trying +to add an index entry to a child table. Note that the adding may be the result +of an update, too. */ +static +void +row_ins_foreign_report_add_err( +/*===========================*/ + trx_t* trx, /*!< in: transaction */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + const rec_t* rec, /*!< in: a record in the parent table: + it does not match entry because we + have an error! 
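// --------------------------------------------------------------------------
// [Editorial sketch -- not part of this commit] row_ins_foreign_trx_print()
// above copies the lock counters inside a short TMLockMutexGuard scope and
// only then performs the slow formatted printing, so the global lock mutex
// is never held across file I/O. The same shape with standard-library types
// (all names here are hypothetical):
#include <cstddef>
#include <cstdio>
#include <mutex>

struct LockStats { unsigned rec_locks; unsigned trx_locks; std::size_t heap_bytes; };

static void print_lock_stats(std::mutex& lock_mutex, const LockStats& live,
			     std::FILE* out)
{
	LockStats snap;
	{
		std::lock_guard<std::mutex> g(lock_mutex);
		snap = live;             // cheap copy while holding the mutex
	}
	std::fprintf(out, "%u record locks, %u lock structs, %zu heap bytes\n",
		     snap.rec_locks, snap.trx_locks, snap.heap_bytes);
}
// --------------------------------------------------------------------------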
*/ + const dtuple_t* entry) /*!< in: index entry to insert in the + child table */ +{ + std::string fk_str; + + if (srv_read_only_mode) { + return; + } + + FILE* ef = dict_foreign_err_file; + + row_ins_set_detailed(trx, foreign); + + row_ins_foreign_trx_print(trx); + + fputs("Foreign key constraint fails for table ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + fputs(":\n", ef); + fk_str = dict_print_info_on_foreign_key_in_create_format(trx, foreign, + TRUE); + fputs(fk_str.c_str(), ef); + if (foreign->foreign_index) { + fprintf(ef, " in parent table, in index %s", + foreign->foreign_index->name()); + } else { + fputs(" in parent table", ef); + } + if (entry) { + fputs(" tuple:\n", ef); + /* TODO: DB_TRX_ID and DB_ROLL_PTR may be uninitialized. + It would be better to only display the user columns. */ + dtuple_print(ef, entry); + } + fputs("\nBut in parent table ", ef); + ut_print_name(ef, trx, foreign->referenced_table_name); + fprintf(ef, ", in index %s,\n" + "the closest match we can find is record:\n", + foreign->referenced_index->name()); + if (rec && page_rec_is_supremum(rec)) { + /* If the cursor ended on a supremum record, it is better + to report the previous record in the error message, so that + the user gets a more descriptive error message. */ + rec = page_rec_get_prev_const(rec); + } + + if (rec) { + rec_print(ef, rec, foreign->referenced_index); + } + putc('\n', ef); + + mysql_mutex_unlock(&dict_foreign_err_mutex); +} + +/*********************************************************************//** +Invalidate the query cache for the given table. */ +static +void +row_ins_invalidate_query_cache( +/*===========================*/ + que_thr_t* thr, /*!< in: query thread whose run_node + is an update node */ + const char* name) /*!< in: table name prefixed with + database name and a '/' character */ +{ + innobase_invalidate_query_cache(thr_get_trx(thr), name); +} + +/** Fill virtual column information in cascade node for the child table. +@param[out] cascade child update node +@param[in] rec clustered rec of child table +@param[in] index clustered index of child table +@param[in] node parent update node +@param[in] foreign foreign key information +@return error code. 
*/ +static +dberr_t +row_ins_foreign_fill_virtual( + upd_node_t* cascade, + const rec_t* rec, + dict_index_t* index, + upd_node_t* node, + dict_foreign_t* foreign) +{ + THD* thd = current_thd; + row_ext_t* ext; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + const rec_offs* offsets = + rec_get_offsets(rec, index, offsets_, index->n_core_fields, + ULINT_UNDEFINED, &cascade->heap); + TABLE* mysql_table= NULL; + upd_t* update = cascade->update; + ulint n_v_fld = index->table->n_v_def; + ulint n_diff; + upd_field_t* upd_field; + dict_vcol_set* v_cols = foreign->v_cols; + update->old_vrow = row_build( + ROW_COPY_DATA, index, rec, + offsets, index->table, NULL, NULL, + &ext, update->heap); + n_diff = update->n_fields; + + ut_ad(index->table->vc_templ != NULL); + + ib_vcol_row vc(NULL); + uchar *record = vc.record(thd, index, &mysql_table); + if (!record) { + return DB_OUT_OF_MEMORY; + } + ut_ad(!node->is_delete + || (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL)); + ut_ad(foreign->type & (DICT_FOREIGN_ON_DELETE_SET_NULL + | DICT_FOREIGN_ON_UPDATE_SET_NULL + | DICT_FOREIGN_ON_UPDATE_CASCADE)); + + for (uint16_t i = 0; i < n_v_fld; i++) { + + dict_v_col_t* col = dict_table_get_nth_v_col( + index->table, i); + + dict_vcol_set::iterator it = v_cols->find(col); + + if (it == v_cols->end()) { + continue; + } + + dfield_t* vfield = innobase_get_computed_value( + update->old_vrow, col, index, + &vc.heap, update->heap, NULL, thd, mysql_table, + record, NULL, NULL); + + if (vfield == NULL) { + return DB_COMPUTE_VALUE_FAILED; + } + + upd_field = update->fields + n_diff; + + upd_field->old_v_val = static_cast<dfield_t*>( + mem_heap_alloc(update->heap, + sizeof *upd_field->old_v_val)); + + dfield_copy(upd_field->old_v_val, vfield); + + upd_field_set_v_field_no(upd_field, i, index); + + dfield_t* new_vfield = innobase_get_computed_value( + update->old_vrow, col, index, + &vc.heap, update->heap, NULL, thd, + mysql_table, record, NULL, + update); + + if (new_vfield == NULL) { + return DB_COMPUTE_VALUE_FAILED; + } + + dfield_copy(&upd_field->new_val, new_vfield); + + if (!dfield_datas_are_binary_equal( + upd_field->old_v_val, + &upd_field->new_val, 0)) + n_diff++; + } + + update->n_fields = n_diff; + return DB_SUCCESS; +} + +#ifdef WITH_WSREP +dberr_t wsrep_append_foreign_key(trx_t *trx, + dict_foreign_t* foreign, + const rec_t* clust_rec, + dict_index_t* clust_index, + bool referenced, + upd_node_t* upd_node, + bool pa_disable, + Wsrep_service_key_type key_type); +#endif /* WITH_WSREP */ + +/*********************************************************************//** +Perform referential actions or checks when a parent row is deleted or updated +and the constraint had an ON DELETE or ON UPDATE condition which was not +RESTRICT. 
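// --------------------------------------------------------------------------
// [Editorial sketch -- not part of this commit] row_ins_foreign_fill_virtual()
// above recomputes each affected virtual column twice (against the old row
// and against the cascaded update) and grows the update vector only when the
// two values are not binary-equal. The filtering idea in isolation, with
// hypothetical types:
#include <string>
#include <utility>
#include <vector>

struct VColUpdate { unsigned field_no; std::string old_val, new_val; };

static std::vector<VColUpdate> collect_changed(
	const std::vector<std::pair<std::string, std::string>>& old_new)
{
	std::vector<VColUpdate> changed;
	for (unsigned i = 0; i < old_new.size(); i++)
		if (old_new[i].first != old_new[i].second)
			changed.push_back({i, old_new[i].first,
					   old_new[i].second});
	return changed;
}
// --------------------------------------------------------------------------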
+@return DB_SUCCESS, DB_LOCK_WAIT, or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins_foreign_check_on_constraint( +/*================================*/ + que_thr_t* thr, /*!< in: query thread whose run_node + is an update node */ + dict_foreign_t* foreign, /*!< in: foreign key constraint whose + type is != 0 */ + btr_pcur_t* pcur, /*!< in: cursor placed on a matching + index record in the child table */ + dtuple_t* entry, /*!< in: index entry in the parent + table */ + mtr_t* mtr) /*!< in: mtr holding the latch of pcur + page */ +{ + upd_node_t* node; + upd_node_t* cascade; + dict_table_t*const*const fktable = &foreign->foreign_table; + dict_table_t* table = *fktable; + dict_index_t* index; + dict_index_t* clust_index; + dtuple_t* ref; + const rec_t* rec; + const rec_t* clust_rec; + const buf_block_t* clust_block; + upd_t* update; + dberr_t err; + trx_t* trx; + mem_heap_t* tmp_heap = NULL; + doc_id_t doc_id = FTS_NULL_DOC_ID; + + DBUG_ENTER("row_ins_foreign_check_on_constraint"); + + trx = thr_get_trx(thr); + + /* Since we are going to delete or update a row, we have to invalidate + the MySQL query cache for table. A deadlock of threads is not possible + here because the caller of this function does not hold any latches with + the mutex rank above the lock_sys.latch. The query cache mutex + has a rank just above the lock_sys.latch. */ + + row_ins_invalidate_query_cache(thr, table->name.m_name); + + node = static_cast<upd_node_t*>(thr->run_node); + + if (node->is_delete && 0 == (foreign->type + & (DICT_FOREIGN_ON_DELETE_CASCADE + | DICT_FOREIGN_ON_DELETE_SET_NULL))) { + + row_ins_foreign_report_err("Trying to delete", + thr, foreign, + btr_pcur_get_rec(pcur), entry); + + DBUG_RETURN(DB_ROW_IS_REFERENCED); + } + + if (!node->is_delete && 0 == (foreign->type + & (DICT_FOREIGN_ON_UPDATE_CASCADE + | DICT_FOREIGN_ON_UPDATE_SET_NULL))) { + + /* This is an UPDATE */ + + row_ins_foreign_report_err("Trying to update", + thr, foreign, + btr_pcur_get_rec(pcur), entry); + + DBUG_RETURN(DB_ROW_IS_REFERENCED); + } + + if (node->cascade_node == NULL) { + node->cascade_heap = mem_heap_create(128); + node->cascade_node = row_create_update_node_for_mysql( + table, node->cascade_heap); + que_node_set_parent(node->cascade_node, node); + + } + cascade = node->cascade_node; + cascade->table = table; + cascade->foreign = foreign; + + if (node->is_delete + && (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE)) { + cascade->is_delete = PLAIN_DELETE; + } else { + cascade->is_delete = NO_DELETE; + + if (foreign->n_fields > cascade->update_n_fields) { + /* We have to make the update vector longer */ + + cascade->update = upd_create(foreign->n_fields, + node->cascade_heap); + cascade->update_n_fields = foreign->n_fields; + } + + /* We do not allow cyclic cascaded updating (DELETE is + allowed, but not UPDATE) of the same table, as this + can lead to an infinite cycle. Check that we are not + updating the same table which is already being + modified in this cascade chain. We have to check this + also because the modification of the indexes of a + 'parent' table may still be incomplete, and we must + avoid seeing the indexes of the parent table in an + inconsistent state! 
*/ + + if (row_ins_cascade_ancestor_updates_table(cascade, table)) { + + /* We do not know if this would break foreign key + constraints, but play safe and return an error */ + + err = DB_ROW_IS_REFERENCED; + + row_ins_foreign_report_err( + "Trying an update, possibly causing a cyclic" + " cascaded update\n" + "in the child table,", thr, foreign, + btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + } + } + + if (row_ins_cascade_n_ancestors(cascade) >= FK_MAX_CASCADE_DEL) { + err = DB_FOREIGN_EXCEED_MAX_CASCADE; + + row_ins_foreign_report_err( + "Trying a too deep cascaded delete or update\n", + thr, foreign, btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + } + + index = pcur->index(); + + ut_a(index == foreign->foreign_index); + + rec = btr_pcur_get_rec(pcur); + + tmp_heap = mem_heap_create(256); + + if (dict_index_is_clust(index)) { + /* pcur is already positioned in the clustered index of + the child table */ + + clust_index = index; + clust_rec = rec; + clust_block = btr_pcur_get_block(pcur); + } else { + /* We have to look for the record in the clustered index + in the child table */ + + clust_index = dict_table_get_first_index(table); + + ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, + tmp_heap); + cascade->pcur->old_rec = nullptr; + cascade->pcur->btr_cur.page_cur.index = clust_index; + err = btr_pcur_open_with_no_init(ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + cascade->pcur, mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto nonstandard_exit_func; + } + + clust_rec = btr_pcur_get_rec(cascade->pcur); + clust_block = btr_pcur_get_block(cascade->pcur); + + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(cascade->pcur) + < dict_index_get_n_unique(clust_index)) { + + ib::error() << "In cascade of a foreign key op index " + << index->name + << " of table " << index->table->name; + + fputs("InnoDB: record ", stderr); + rec_print(stderr, rec, index); + fputs("\n" + "InnoDB: clustered record ", stderr); + rec_print(stderr, clust_rec, clust_index); + fputs("\n" + "InnoDB: Submit a detailed bug report to" + " https://jira.mariadb.org/\n", stderr); + ut_ad(0); + err = DB_SUCCESS; + + goto nonstandard_exit_func; + } + } + + /* Set an X-lock on the row to delete or update in the child table */ + + err = lock_table(table, fktable, LOCK_IX, thr); + + if (err == DB_SUCCESS) { + /* Here it suffices to use a LOCK_REC_NOT_GAP type lock; + we already have a normal shared lock on the appropriate + gap if the search criterion was not unique */ + + err = lock_clust_rec_read_check_and_lock_alt( + 0, clust_block, clust_rec, clust_index, + LOCK_X, LOCK_REC_NOT_GAP, thr); + } + + if (err != DB_SUCCESS) { + + goto nonstandard_exit_func; + } + + if (rec_get_deleted_flag(clust_rec, dict_table_is_comp(table))) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(rec_get_trx_id(clust_rec, clust_index)); + /* This can happen if there is a circular reference of + rows such that cascading delete comes to delete a row + already in the process of being delete marked */ + err = DB_SUCCESS; + + goto nonstandard_exit_func; + } + + if (table->fts) { + doc_id = fts_get_doc_id_from_rec( + clust_rec, clust_index, + rec_get_offsets(clust_rec, clust_index, NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &tmp_heap)); + } + + if (node->is_delete + ? 
(foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) + : (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL)) { + /* Build the appropriate update vector which sets + foreign->n_fields first fields in rec to SQL NULL */ + + update = cascade->update; + + update->info_bits = 0; + update->n_fields = foreign->n_fields; + MEM_UNDEFINED(update->fields, + update->n_fields * sizeof *update->fields); + + for (ulint i = 0; i < foreign->n_fields; i++) { + upd_field_t* ufield = &update->fields[i]; + ulint col_no = dict_index_get_nth_col_no( + index, i); + ulint prefix_col; + + ufield->field_no = static_cast<uint16_t>( + dict_table_get_nth_col_pos( + table, col_no, &prefix_col)); + dict_col_t* col = dict_table_get_nth_col( + table, col_no); + dict_col_copy_type(col, dfield_get_type(&ufield->new_val)); + + ufield->orig_len = 0; + ufield->exp = NULL; + dfield_set_null(&ufield->new_val); + } + + if (foreign->affects_fulltext()) { + fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL); + } + + if (foreign->v_cols != NULL + && foreign->v_cols->size() > 0) { + err = row_ins_foreign_fill_virtual( + cascade, clust_rec, clust_index, + node, foreign); + + if (err != DB_SUCCESS) { + goto nonstandard_exit_func; + } + } + } else if (table->fts && cascade->is_delete == PLAIN_DELETE + && foreign->affects_fulltext()) { + /* DICT_FOREIGN_ON_DELETE_CASCADE case */ + fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL); + } + + if (!node->is_delete + && (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE)) { + + /* Build the appropriate update vector which sets changing + foreign->n_fields first fields in rec to new values */ + + bool affects_fulltext = row_ins_cascade_calc_update_vec( + node, foreign, tmp_heap, trx); + + if (foreign->v_cols && !foreign->v_cols->empty()) { + err = row_ins_foreign_fill_virtual( + cascade, clust_rec, clust_index, + node, foreign); + + if (err != DB_SUCCESS) { + goto nonstandard_exit_func; + } + } + + switch (cascade->update->n_fields) { + case ULINT_UNDEFINED: + err = DB_ROW_IS_REFERENCED; + + row_ins_foreign_report_err( + "Trying a cascaded update where the" + " updated value in the child\n" + "table would not fit in the length" + " of the column, or the value would\n" + "be NULL and the column is" + " declared as not NULL in the child table,", + thr, foreign, btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + case 0: + /* The update does not change any columns referred + to in this foreign key constraint: no need to do + anything */ + + err = DB_SUCCESS; + + goto nonstandard_exit_func; + } + + /* Mark the old Doc ID as deleted */ + if (affects_fulltext) { + ut_ad(table->fts); + fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL); + } + } + + if (table->versioned() && cascade->is_delete != PLAIN_DELETE + && cascade->update->affects_versioned()) { + ut_ad(!cascade->historical_heap); + cascade->historical_heap = mem_heap_create(srv_page_size); + cascade->historical_row = row_build( + ROW_COPY_DATA, clust_index, clust_rec, NULL, table, + NULL, NULL, NULL, cascade->historical_heap); + } + + /* Store pcur position and initialize or store the cascade node + pcur stored position */ + + btr_pcur_store_position(pcur, mtr); + + if (index == clust_index) { + btr_pcur_copy_stored_position(cascade->pcur, pcur); + } else { + btr_pcur_store_position(cascade->pcur, mtr); + } + +#ifdef WITH_WSREP + if (trx->is_wsrep()) { + err = wsrep_append_foreign_key(trx, foreign, clust_rec, clust_index, + false, NULL, true, + WSREP_SERVICE_KEY_EXCLUSIVE); + if (err != DB_SUCCESS) { + goto nonstandard_exit_func; 
+ } + } +#endif /* WITH_WSREP */ + mtr_commit(mtr); + + ut_a(cascade->pcur->rel_pos == BTR_PCUR_ON); + + cascade->state = UPD_NODE_UPDATE_CLUSTERED; + + err = row_update_cascade_for_mysql(thr, cascade, + foreign->foreign_table); + + mtr_start(mtr); + + /* Restore pcur position */ + + if (pcur->restore_position(BTR_SEARCH_LEAF, mtr) + != btr_pcur_t::SAME_ALL) { + err = DB_CORRUPTION; + } + + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + DBUG_RETURN(err); + +nonstandard_exit_func: + + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + btr_pcur_store_position(pcur, mtr); + + mtr_commit(mtr); + mtr_start(mtr); + + if (pcur->restore_position(BTR_SEARCH_LEAF, mtr) + != btr_pcur_t::SAME_ALL && err == DB_SUCCESS) { + err = DB_CORRUPTION; + } + + DBUG_RETURN(err); +} + +/*********************************************************************//** +Sets a shared lock on a record. Used in locking possible duplicate key +records and also in checking foreign key constraints. +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ +static +dberr_t +row_ins_set_shared_rec_lock( +/*========================*/ + unsigned type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP type lock */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (dict_index_is_clust(index)) { + err = lock_clust_rec_read_check_and_lock( + 0, block, rec, index, offsets, LOCK_S, type, thr); + } else { + err = lock_sec_rec_read_check_and_lock( + 0, block, rec, index, offsets, LOCK_S, type, thr); + } + + return(err); +} + +/*********************************************************************//** +Sets a exclusive lock on a record. Used in locking possible duplicate key +records +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ +static +dberr_t +row_ins_set_exclusive_rec_lock( +/*===========================*/ + unsigned type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP type lock */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (dict_index_is_clust(index)) { + err = lock_clust_rec_read_check_and_lock( + 0, block, rec, index, offsets, LOCK_X, type, thr); + } else { + err = lock_sec_rec_read_check_and_lock( + 0, block, rec, index, offsets, LOCK_X, type, thr); + } + + return(err); +} + +/***************************************************************//** +Checks if foreign key constraint fails for an index entry. Sets shared locks +which lock either the success or the failure of the constraint. NOTE that +the caller must have a shared latch on dict_sys.latch. 
+@return DB_SUCCESS, DB_NO_REFERENCED_ROW, or DB_ROW_IS_REFERENCED */ +dberr_t +row_ins_check_foreign_constraint( +/*=============================*/ + ibool check_ref,/*!< in: TRUE if we want to check that + the referenced table is ok, FALSE if we + want to check the foreign key table */ + dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the + tables mentioned in it must be in the + dictionary cache if they exist at all */ + dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign + table, else the referenced table */ + dtuple_t* entry, /*!< in: index entry for index */ + que_thr_t* thr) /*!< in: query thread */ +{ + upd_node_t* upd_node; + ulint n_fields_cmp; + btr_pcur_t pcur; + int cmp; + mtr_t mtr; + trx_t* trx = thr_get_trx(thr); + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + + bool skip_gap_lock; + + skip_gap_lock = (trx->isolation_level <= TRX_ISO_READ_COMMITTED); + + DBUG_ENTER("row_ins_check_foreign_constraint"); + + rec_offs_init(offsets_); + +#ifdef WITH_WSREP + upd_node= NULL; +#endif /* WITH_WSREP */ + + if (!trx->check_foreigns) { + /* The user has suppressed foreign key checks currently for + this session */ + DBUG_RETURN(DB_SUCCESS); + } + + /* If any of the foreign key fields in entry is SQL NULL, we + suppress the foreign key check: this is compatible with Oracle, + for example */ + for (ulint i = 0; i < entry->n_fields; i++) { + dfield_t* field = dtuple_get_nth_field(entry, i); + if (i < foreign->n_fields && dfield_is_null(field)) { + DBUG_RETURN(DB_SUCCESS); + } + /* System Versioning: if row_end != Inf, we + suppress the foreign key check */ + if (field->type.vers_sys_end() && field->vers_history_row()) { + DBUG_RETURN(DB_SUCCESS); + } + } + + if (que_node_get_type(thr->run_node) == QUE_NODE_UPDATE) { + upd_node = static_cast<upd_node_t*>(thr->run_node); + + if (upd_node->is_delete != PLAIN_DELETE + && upd_node->foreign == foreign) { + /* If a cascaded update is done as defined by a + foreign key constraint, do not check that + constraint for the child row. In ON UPDATE CASCADE + the update of the parent row is only half done when + we come here: if we would check the constraint here + for the child row it would fail. + + A QUESTION remains: if in the child table there are + several constraints which refer to the same parent + table, we should merge all updates to the child as + one update? And the updates can be contradictory! + Currently we just perform the update associated + with each foreign key constraint, one after + another, and the user has problems predicting in + which order they are performed. */ + + DBUG_RETURN(DB_SUCCESS); + } + } + + if (que_node_get_type(thr->run_node) == QUE_NODE_INSERT) { + ins_node_t* insert_node = + static_cast<ins_node_t*>(thr->run_node); + dict_table_t* table = insert_node->index->table; + if (table->versioned()) { + dfield_t* row_end = dtuple_get_nth_field( + insert_node->row, table->vers_end); + if (row_end->vers_history_row()) { + DBUG_RETURN(DB_SUCCESS); + } + } + } + + dict_table_t *check_table; + dict_index_t *check_index; + dberr_t err = DB_SUCCESS; + + { + dict_table_t*& fktable = check_ref + ? foreign->referenced_table : foreign->foreign_table; + check_table = fktable; + if (check_table) { + err = lock_table(check_table, &fktable, LOCK_IS, thr); + if (err != DB_SUCCESS) { + goto do_possible_lock_wait; + } + } + check_table = fktable; + } + + check_index = check_ref + ? 
foreign->referenced_index : foreign->foreign_index; + + if (!check_table || !check_table->is_readable() || !check_index) { + FILE* ef = dict_foreign_err_file; + std::string fk_str; + + row_ins_set_detailed(trx, foreign); + row_ins_foreign_trx_print(trx); + + fputs("Foreign key constraint fails for table ", ef); + ut_print_name(ef, trx, check_ref + ? foreign->foreign_table_name + : foreign->referenced_table_name); + fputs(":\n", ef); + fk_str = dict_print_info_on_foreign_key_in_create_format( + trx, foreign, TRUE); + fputs(fk_str.c_str(), ef); + if (check_ref) { + if (foreign->foreign_index) { + fprintf(ef, "\nTrying to add to index %s" + " tuple:\n", + foreign->foreign_index->name()); + } else { + fputs("\nTrying to add tuple:\n", ef); + } + dtuple_print(ef, entry); + fputs("\nBut the parent table ", ef); + ut_print_name(ef, trx, foreign->referenced_table_name); + fputs("\nor its .ibd file or the required index does" + " not currently exist!\n", ef); + err = DB_NO_REFERENCED_ROW; + } else { + if (foreign->referenced_index) { + fprintf(ef, "\nTrying to modify index %s" + " tuple:\n", + foreign->referenced_index->name()); + } else { + fputs("\nTrying to modify tuple:\n", ef); + } + dtuple_print(ef, entry); + fputs("\nBut the referencing table ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + fputs("\nor its .ibd file or the required index does" + " not currently exist!\n", ef); + err = DB_ROW_IS_REFERENCED; + } + + mysql_mutex_unlock(&dict_foreign_err_mutex); + goto exit_func; + } + + mtr_start(&mtr); + + /* Store old value on n_fields_cmp */ + + n_fields_cmp = dtuple_get_n_fields_cmp(entry); + + dtuple_set_n_fields_cmp(entry, foreign->n_fields); + pcur.btr_cur.page_cur.index = check_index; + err = btr_pcur_open(entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, &mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto end_scan; + } + + /* Scan index records and check if there is a matching record */ + + do { + const rec_t* rec = btr_pcur_get_rec(&pcur); + const buf_block_t* block = btr_pcur_get_block(&pcur); + + if (page_rec_is_infimum(rec)) { + + continue; + } + + offsets = rec_get_offsets(rec, check_index, offsets, + check_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (page_rec_is_supremum(rec)) { + + if (skip_gap_lock) { + + continue; + } + + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, block, + rec, check_index, + offsets, thr); + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + continue; + default: + goto end_scan; + } + } + + cmp = cmp_dtuple_rec(entry, rec, check_index, offsets); + + if (cmp == 0) { + if (rec_get_deleted_flag(rec, + rec_offs_comp(offsets))) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(!dict_index_is_clust(check_index) + || row_get_rec_trx_id(rec, check_index, + offsets)); + + err = row_ins_set_shared_rec_lock( + skip_gap_lock + ? LOCK_REC_NOT_GAP + : LOCK_ORDINARY, block, + rec, check_index, offsets, thr); + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + default: + goto end_scan; + } + } else { + if (check_table->versioned()) { + bool history_row = false; + + if (check_index->is_primary()) { + history_row = check_index-> + vers_history_row(rec, + offsets); + } else if (check_index-> + vers_history_row(rec, + history_row)) { + break; + } + + if (history_row) { + continue; + } + } + /* Found a matching record. 
Lock only + a record because we can allow inserts + into gaps */ + + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, block, + rec, check_index, offsets, thr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + default: + goto end_scan; + } + + if (check_ref) { + err = DB_SUCCESS; +#ifdef WITH_WSREP + if (trx->is_wsrep()) { + err = wsrep_append_foreign_key( + thr_get_trx(thr), + foreign, + rec, + check_index, + check_ref, + upd_node, + false, + WSREP_SERVICE_KEY_REFERENCE); + } +#endif /* WITH_WSREP */ + goto end_scan; + } else if (foreign->type != 0) { + /* There is an ON UPDATE or ON DELETE + condition: check them in a separate + function */ + + err = row_ins_foreign_check_on_constraint( + thr, foreign, &pcur, entry, + &mtr); + if (err != DB_SUCCESS) { + /* Since reporting a plain + "duplicate key" error + message to the user in + cases where a long CASCADE + operation would lead to a + duplicate key in some + other table is very + confusing, map duplicate + key errors resulting from + FK constraints to a + separate error code. */ + + if (err == DB_DUPLICATE_KEY) { + err = DB_FOREIGN_DUPLICATE_KEY; + } + + goto end_scan; + } + + /* row_ins_foreign_check_on_constraint + may have repositioned pcur on a + different block */ + block = btr_pcur_get_block(&pcur); + } else { + row_ins_foreign_report_err( + "Trying to delete or update", + thr, foreign, rec, entry); + + err = DB_ROW_IS_REFERENCED; + goto end_scan; + } + } + } else { + ut_a(cmp < 0); + + err = skip_gap_lock + ? DB_SUCCESS + : row_ins_set_shared_rec_lock( + LOCK_GAP, block, + rec, check_index, offsets, thr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + /* fall through */ + case DB_SUCCESS: + if (check_ref) { + err = DB_NO_REFERENCED_ROW; + row_ins_foreign_report_add_err( + trx, foreign, rec, entry); + } + default: + break; + } + + goto end_scan; + } + } while (btr_pcur_move_to_next(&pcur, &mtr)); + + if (check_ref) { + row_ins_foreign_report_add_err( + trx, foreign, btr_pcur_get_rec(&pcur), entry); + err = DB_NO_REFERENCED_ROW; + } else { + err = DB_SUCCESS; + } + +end_scan: + mtr_commit(&mtr); + ut_free(pcur.old_rec_buf); + + /* Restore old value */ + dtuple_set_n_fields_cmp(entry, n_fields_cmp); + +do_possible_lock_wait: + if (err == DB_LOCK_WAIT) { + trx->error_state = err; + + thr->lock_state = QUE_THR_LOCK_ROW; + + err = lock_wait(thr); + + thr->lock_state = QUE_THR_LOCK_NOLOCK; + + if (err == DB_SUCCESS) { + err = DB_LOCK_WAIT; + } + } + +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + DBUG_RETURN(err); +} + +/** Sets the values of the dtuple fields in ref_entry from the values of +foreign columns in entry. +@param[in] foreign foreign key constraint +@param[in] index clustered index +@param[in] entry tuple of clustered index +@param[in] ref_entry tuple of foreign columns +@return true if all foreign key fields present in clustered index */ +static +bool row_ins_foreign_index_entry(dict_foreign_t *foreign, + const dict_index_t *index, + const dtuple_t *entry, + dtuple_t *ref_entry) +{ + for (ulint i= 0; i < foreign->n_fields; i++) + { + for (ulint j= 0; j < index->n_fields; j++) + { + const dict_col_t *col= dict_index_get_nth_col(index, j); + + /* A clustered index may contain instantly dropped columns, + which must be skipped. 
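// --------------------------------------------------------------------------
// [Editorial sketch -- not part of this commit] A simplified view of the
// lock-mode choices made in the foreign key scan above: next-key (ordinary)
// locks also protect the gap before a record, and the gap component is
// skipped at READ COMMITTED and below; a matching user record itself only
// needs a not-gap record lock. (The real code additionally takes a next-key
// lock on delete-marked matches at higher isolation levels.) Hypothetical
// enums:
enum class ScanRec { SUPREMUM, MATCHING_ROW, FIRST_PAST_KEY };
enum class FkLock  { NONE, ORDINARY /* next-key */, REC_NOT_GAP, GAP };

static FkLock fk_scan_lock(ScanRec kind, bool skip_gap_lock)
{
	switch (kind) {
	case ScanRec::SUPREMUM:       // page end: protect the gap, if needed
		return skip_gap_lock ? FkLock::NONE : FkLock::ORDINARY;
	case ScanRec::MATCHING_ROW:   // a row carrying the referenced key
		return FkLock::REC_NOT_GAP;
	case ScanRec::FIRST_PAST_KEY: // no match: lock the gap before it
		return skip_gap_lock ? FkLock::NONE : FkLock::GAP;
	}
	return FkLock::NONE;
}
// --------------------------------------------------------------------------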
*/ + if (col->is_dropped()) + continue; + + const char *col_name= dict_table_get_col_name(index->table, col->ind); + if (0 == innobase_strcasecmp(col_name, foreign->foreign_col_names[i])) + { + dfield_copy(&ref_entry->fields[i], &entry->fields[j]); + goto got_match; + } + } + return false; +got_match: + continue; + } + + return true; +} + +/***************************************************************//** +Checks if foreign key constraints fail for an index entry. If index +is not mentioned in any constraint, this function does nothing, +Otherwise does searches to the indexes of referenced tables and +sets shared locks which lock either the success or the failure of +a constraint. +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins_check_foreign_constraints( +/*==============================*/ + dict_table_t* table, /*!< in: table */ + dict_index_t* index, /*!< in: index */ + bool pk, /*!< in: index->is_primary() */ + dtuple_t* entry, /*!< in: index entry for index */ + que_thr_t* thr) /*!< in: query thread */ +{ + dict_foreign_t* foreign; + dberr_t err = DB_SUCCESS; + mem_heap_t* heap = NULL; + + DBUG_ASSERT(index->is_primary() == pk); + + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "foreign_constraint_check_for_ins"); + + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + err == DB_SUCCESS && it != table->foreign_set.end(); + ++it) { + + foreign = *it; + + if (foreign->foreign_index == index + || (pk && !foreign->foreign_index)) { + + dtuple_t* ref_tuple = entry; + if (UNIV_UNLIKELY(!foreign->foreign_index)) { + /* Change primary key entry to + foreign key index entry */ + if (!heap) { + heap = mem_heap_create(1000); + } else { + mem_heap_empty(heap); + } + + ref_tuple = dtuple_create( + heap, foreign->n_fields); + dtuple_set_n_fields_cmp( + ref_tuple, foreign->n_fields); + if (!row_ins_foreign_index_entry( + foreign, index, entry, ref_tuple)) { + err = DB_NO_REFERENCED_ROW; + break; + } + + } + + dict_table_t* ref_table = NULL; + dict_table_t* referenced_table + = foreign->referenced_table; + + if (referenced_table == NULL) { + + ref_table = dict_table_open_on_name( + foreign->referenced_table_name_lookup, + false, DICT_ERR_IGNORE_NONE); + } + + err = row_ins_check_foreign_constraint( + TRUE, foreign, table, ref_tuple, thr); + + if (ref_table) { + dict_table_close(ref_table); + } + } + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return err; +} + +/***************************************************************//** +Checks if a unique key violation to rec would occur at the index entry +insert. +@return TRUE if error */ +static +ibool +row_ins_dupl_error_with_rec( +/*========================*/ + const rec_t* rec, /*!< in: user record; NOTE that we assume + that the caller already has a record lock on + the record! 
*/ + const dtuple_t* entry, /*!< in: entry to insert */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + ulint matched_fields; + ulint n_unique; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + n_unique = dict_index_get_n_unique(index); + + matched_fields = 0; + + cmp_dtuple_rec_with_match(entry, rec, index, offsets, &matched_fields); + + if (matched_fields < n_unique) { + + return(FALSE); + } + + /* In a unique secondary index we allow equal key values if they + contain SQL NULLs */ + + if (!dict_index_is_clust(index) && !index->nulls_equal) { + + for (i = 0; i < n_unique; i++) { + if (dfield_is_null(dtuple_get_nth_field(entry, i))) { + + return(FALSE); + } + } + } + + return(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); +} + +/** Determine whether a history row was inserted by this transaction +(row TRX_ID is the same as current TRX_ID). +@param index secondary index +@param rec secondary index record +@param trx transaction +@return error code +@retval DB_SUCCESS on success +@retval DB_FOREIGN_DUPLICATE_KEY if a history row was inserted by trx */ +static dberr_t vers_row_same_trx(dict_index_t* index, const rec_t* rec, + const trx_t& trx) +{ + mtr_t mtr; + dberr_t ret= DB_SUCCESS; + dict_index_t *clust_index= dict_table_get_first_index(index->table); + ut_ad(index != clust_index); + + mtr.start(); + + if (const rec_t *clust_rec= + row_get_clust_rec(BTR_SEARCH_LEAF, rec, index, &clust_index, &mtr)) + { + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs *clust_offs= offsets_; + rec_offs_init(offsets_); + mem_heap_t *heap= NULL; + + clust_offs= + rec_get_offsets(clust_rec, clust_index, clust_offs, + clust_index->n_core_fields, ULINT_UNDEFINED, &heap); + if (clust_index->vers_history_row(clust_rec, clust_offs)) + { + ulint trx_id_len; + const byte *trx_id= rec_get_nth_field(clust_rec, clust_offs, + clust_index->n_uniq, &trx_id_len); + ut_ad(trx_id_len == DATA_TRX_ID_LEN); + + if (trx.id == trx_read_trx_id(trx_id)) + ret= DB_FOREIGN_DUPLICATE_KEY; + } + + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + } + else + { + ib::error() << "foreign constraints: secondary index " << index->name << + " of table " << index->table->name << " is out of sync"; + ut_ad("secondary index is out of sync" == 0); + ret= DB_TABLE_CORRUPT; + } + + mtr.commit(); + return ret; +} + +/***************************************************************//** +Scans a unique non-clustered index at a given index entry to determine +whether a uniqueness violation has occurred for the key value of the entry. +Set shared locks on possible duplicate records. 
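// --------------------------------------------------------------------------
// [Editorial sketch -- not part of this commit] The check above encodes the
// SQL rule that NULL != NULL in a unique secondary index: if any of the
// n_unique key fields of the new entry is NULL, no duplicate-key error can
// be raised (unless the index uses "NULLs are equal" semantics, as
// index->nulls_equal indicates for some internal indexes). In isolation:
#include <optional>
#include <string>
#include <vector>

static bool unique_conflict_possible(
	const std::vector<std::optional<std::string>>& unique_key_fields,
	bool nulls_equal)
{
	if (!nulls_equal)
		for (const auto& f : unique_key_fields)
			if (!f.has_value())
				return false;  // a NULL key part never collides
	return true;
}
// --------------------------------------------------------------------------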
+@return DB_SUCCESS, DB_DUPLICATE_KEY, or DB_LOCK_WAIT */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins_scan_sec_index_for_duplicate( +/*=================================*/ + ulint flags, /*!< in: undo logging and locking flags */ + dict_index_t* index, /*!< in: non-clustered unique index */ + dtuple_t* entry, /*!< in: index entry */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mem_heap_t* offsets_heap) + /*!< in/out: memory heap that can be emptied */ +{ + ulint n_unique; + int cmp; + ulint n_fields_cmp; + btr_pcur_t pcur; + rec_offs offsets_[REC_OFFS_SEC_INDEX_SIZE]; + rec_offs* offsets = offsets_; + DBUG_ENTER("row_ins_scan_sec_index_for_duplicate"); + + rec_offs_init(offsets_); + + ut_ad(!index->lock.have_any()); + + n_unique = dict_index_get_n_unique(index); + + /* If the secondary index is unique, but one of the fields in the + n_unique first fields is NULL, a unique key violation cannot occur, + since we define NULL != NULL in this case */ + + if (!index->nulls_equal) { + for (ulint i = 0; i < n_unique; i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(entry, i))) { + + DBUG_RETURN(DB_SUCCESS); + } + } + } + + /* Store old value on n_fields_cmp */ + + n_fields_cmp = dtuple_get_n_fields_cmp(entry); + + dtuple_set_n_fields_cmp(entry, n_unique); + pcur.btr_cur.page_cur.index = index; + trx_t* const trx = thr_get_trx(thr); + dberr_t err = btr_pcur_open(entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, + &pcur, mtr); + if (err != DB_SUCCESS) { + goto end_scan; + } + + /* Scan index records and check if there is a duplicate */ + + do { + const rec_t* rec = btr_pcur_get_rec(&pcur); + const buf_block_t* block = btr_pcur_get_block(&pcur); + const ulint lock_type = LOCK_ORDINARY; + + if (page_rec_is_infimum(rec)) { + + continue; + } + + offsets = rec_get_offsets(rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &offsets_heap); + + if (flags & BTR_NO_LOCKING_FLAG) { + /* Set no locks when applying log + in online table rebuild. */ + } else if (trx->duplicates) { + + /* If the SQL-query will update or replace + duplicate key we will take X-lock for + duplicates ( REPLACE, LOAD DATAFILE REPLACE, + INSERT ON DUPLICATE KEY UPDATE). 
*/ + + err = row_ins_set_exclusive_rec_lock( + lock_type, block, rec, index, offsets, thr); + } else { + + err = row_ins_set_shared_rec_lock( + lock_type, block, rec, index, offsets, thr); + } + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + case DB_SUCCESS: + break; + default: + goto end_scan; + } + + if (page_rec_is_supremum(rec)) { + + continue; + } + + cmp = cmp_dtuple_rec(entry, rec, index, offsets); + + if (cmp == 0) { + if (row_ins_dupl_error_with_rec(rec, entry, + index, offsets)) { + + err = DB_DUPLICATE_KEY; + + trx->error_info = index; + + if (!index->table->versioned()) { + } else if (dberr_t e = + vers_row_same_trx(index, rec, + *trx)) { + err = e; + goto end_scan; + } + + /* If the duplicate is on hidden FTS_DOC_ID, + state so in the error log */ + if (index == index->table->fts_doc_id_index + && DICT_TF2_FLAG_IS_SET( + index->table, + DICT_TF2_FTS_HAS_DOC_ID)) { + + ib::error() << "Duplicate FTS_DOC_ID" + " value on table " + << index->table->name; + } + + goto end_scan; + } + } else { + ut_a(cmp < 0); + goto end_scan; + } + } while (btr_pcur_move_to_next(&pcur, mtr)); + +end_scan: + /* Restore old value */ + dtuple_set_n_fields_cmp(entry, n_fields_cmp); + + DBUG_RETURN(err); +} + +/** Checks for a duplicate when the table is being rebuilt online. +@param n_uniq index->db_trx_id() +@param entry entry being inserted +@param rec clustered index record at insert position +@param index clustered index +@param offsets rec_get_offsets(rec) +@retval DB_SUCCESS when no duplicate is detected +@retval DB_SUCCESS_LOCKED_REC when rec is an exact match of entry or +a newer version of entry (the entry should not be inserted) +@retval DB_DUPLICATE_KEY when entry is a duplicate of rec */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins_duplicate_online(ulint n_uniq, const dtuple_t *entry, + const rec_t *rec, const dict_index_t *index, + rec_offs *offsets) +{ + ulint fields = 0; + + /* During rebuild, there should not be any delete-marked rows + in the new table. */ + ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); + ut_ad(dtuple_get_n_fields_cmp(entry) == n_uniq); + ut_ad(n_uniq == index->db_trx_id()); + + /* Compare the PRIMARY KEY fields and the DB_TRX_ID, DB_ROLL_PTR. */ + cmp_dtuple_rec_with_match_low(entry, rec, index, offsets, n_uniq + 2, + &fields); + + if (fields < n_uniq) { + /* Not a duplicate. */ + return(DB_SUCCESS); + } + + ulint trx_id_len; + + if (fields == n_uniq + 2 + && memcmp(rec_get_nth_field(rec, offsets, n_uniq, &trx_id_len), + reset_trx_id, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) { + ut_ad(trx_id_len == DATA_TRX_ID_LEN); + /* rec is an exact match of entry, and DB_TRX_ID belongs + to a transaction that started after our ALTER TABLE. */ + return(DB_SUCCESS_LOCKED_REC); + } + + return(DB_DUPLICATE_KEY); +} + +/** Checks for a duplicate when the table is being rebuilt online. 
+@retval DB_SUCCESS when no duplicate is detected +@retval DB_SUCCESS_LOCKED_REC when rec is an exact match of entry or +a newer version of entry (the entry should not be inserted) +@retval DB_DUPLICATE_KEY when entry is a duplicate of rec */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins_duplicate_error_in_clust_online( +/*====================================*/ + ulint n_uniq, /*!< in: offset of DB_TRX_ID */ + const dtuple_t* entry, /*!< in: entry that is being inserted */ + const btr_cur_t*cursor, /*!< in: cursor on insert position */ + rec_offs** offsets,/*!< in/out: rec_get_offsets(rec) */ + mem_heap_t** heap) /*!< in/out: heap for offsets */ +{ + dberr_t err = DB_SUCCESS; + const rec_t* rec = btr_cur_get_rec(cursor); + + ut_ad(!cursor->index()->is_instant()); + + if (cursor->low_match >= n_uniq && !page_rec_is_infimum(rec)) { + *offsets = rec_get_offsets(rec, cursor->index(), *offsets, + cursor->index()->n_fields, + ULINT_UNDEFINED, heap); + err = row_ins_duplicate_online(n_uniq, entry, + rec, cursor->index(), *offsets); + if (err != DB_SUCCESS) { + return(err); + } + } + + if (!(rec = page_rec_get_next_const(btr_cur_get_rec(cursor)))) { + return DB_CORRUPTION; + } + + if (cursor->up_match >= n_uniq && !page_rec_is_supremum(rec)) { + *offsets = rec_get_offsets(rec, cursor->index(), *offsets, + cursor->index()->n_fields, + ULINT_UNDEFINED, heap); + err = row_ins_duplicate_online(n_uniq, entry, + rec, cursor->index(), *offsets); + } + + return(err); +} + +/***************************************************************//** +Checks if a unique key violation error would occur at an index entry +insert. Sets shared locks on possible duplicate records. Works only +for a clustered index! +@retval DB_SUCCESS if no error +@retval DB_DUPLICATE_KEY if error, +@retval DB_LOCK_WAIT if we have to wait for a lock on a possible duplicate +record */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins_duplicate_error_in_clust( + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: B-tree cursor */ + const dtuple_t* entry, /*!< in: entry to insert */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + rec_t* rec; + ulint n_unique; + trx_t* trx = thr_get_trx(thr); + mem_heap_t*heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(cursor->index()->is_clust()); + + /* NOTE: For unique non-clustered indexes there may be any number + of delete marked records with the same value for the non-clustered + index key (remember multiversioning), and which differ only in + the row refererence part of the index record, containing the + clustered index key fields. For such a secondary index record, + to avoid race condition, we must FIRST do the insertion and after + that check that the uniqueness condition is not breached! */ + + /* NOTE: A problem is that in the B-tree node pointers on an + upper level may match more to the entry than the actual existing + user records on the leaf level. So, even if low_match would suggest + that a duplicate key violation may occur, this may not be the case. 
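+	As a hypothetical example, an upper-level node pointer for key
+	value 10 can compare equal to the entry even though the leaf page
+	it points to contains only the user records 9 and 11; this is why
+	the candidate record is re-checked with row_ins_dupl_error_with_rec()
+	below and the infimum and supremum pseudo-records are skipped.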
*/ + + n_unique = dict_index_get_n_unique(cursor->index()); + + if (cursor->low_match >= n_unique) { + + rec = btr_cur_get_rec(cursor); + + if (!page_rec_is_infimum(rec)) { + offsets = rec_get_offsets(rec, cursor->index(), + offsets, + cursor->index() + ->n_core_fields, + ULINT_UNDEFINED, &heap); + + /* We set a lock on the possible duplicate: this + is needed in logical logging of MySQL to make + sure that in roll-forward we get the same duplicate + errors as in original execution */ + + if (flags & BTR_NO_LOCKING_FLAG) { + /* Do nothing if no-locking is set */ + err = DB_SUCCESS; + } else if (trx->duplicates) { + + /* If the SQL-query will update or replace + duplicate key we will take X-lock for + duplicates ( REPLACE, LOAD DATAFILE REPLACE, + INSERT ON DUPLICATE KEY UPDATE). */ + + err = row_ins_set_exclusive_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), + rec, cursor->index(), offsets, thr); + } else { + + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), rec, + cursor->index(), offsets, thr); + } + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + default: + goto func_exit; + } + + if (row_ins_dupl_error_with_rec( + rec, entry, cursor->index(), offsets)) { +duplicate: + trx->error_info = cursor->index(); + err = DB_DUPLICATE_KEY; + if (thr->prebuilt + && thr->prebuilt->upd_node + && thr->prebuilt->upd_node->is_delete + == VERSIONED_DELETE + && entry->vers_history_row()) + { + ulint trx_id_len; + byte *trx_id = rec_get_nth_field( + rec, offsets, n_unique, + &trx_id_len); + ut_ad(trx_id_len == DATA_TRX_ID_LEN); + if (trx->id == trx_read_trx_id(trx_id)) { + err = DB_FOREIGN_DUPLICATE_KEY; + } + } + goto func_exit; + } + } + } + + err = DB_SUCCESS; + + if (cursor->up_match >= n_unique) { + + rec = page_rec_get_next(btr_cur_get_rec(cursor)); + + if (rec && !page_rec_is_supremum(rec)) { + offsets = rec_get_offsets(rec, cursor->index(), + offsets, + cursor->index() + ->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (trx->duplicates) { + + /* If the SQL-query will update or replace + duplicate key we will take X-lock for + duplicates ( REPLACE, LOAD DATAFILE REPLACE, + INSERT ON DUPLICATE KEY UPDATE). */ + + err = row_ins_set_exclusive_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), + rec, cursor->index(), offsets, thr); + } else { + + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), + rec, cursor->index(), offsets, thr); + } + + switch (err) { + default: + break; + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + /* fall through */ + case DB_SUCCESS: + if (row_ins_dupl_error_with_rec( + rec, entry, cursor->index(), + offsets)) { + goto duplicate; + } + } + } + + /* This should never happen */ + err = DB_CORRUPTION; + } +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); +} + +/***************************************************************//** +Checks if an index entry has long enough common prefix with an +existing record so that the intended insert of the entry must be +changed to a modify of the existing record. In the case of a clustered +index, the prefix must be n_unique fields long. In the case of a +secondary index, all fields must be equal. InnoDB never updates +secondary index records in place, other than clearing or setting the +delete-mark flag. 
We could be able to update the non-unique fields +of a unique secondary index record by checking the cursor->up_match, +but we do not do so, because it could have some locking implications. +@return TRUE if the existing record should be updated; FALSE if not */ +UNIV_INLINE +ibool +row_ins_must_modify_rec( +/*====================*/ + const btr_cur_t* cursor) /*!< in: B-tree cursor */ +{ + /* NOTE: (compare to the note in row_ins_duplicate_error_in_clust) + Because node pointers on upper levels of the B-tree may match more + to entry than to actual user records on the leaf level, we + have to check if the candidate record is actually a user record. + A clustered index node pointer contains index->n_unique first fields, + and a secondary index node pointer contains all index fields. */ + + return(cursor->low_match + >= dict_index_get_n_unique_in_tree(cursor->index()) + && !page_rec_is_infimum(btr_cur_get_rec(cursor))); +} + +/** Insert the externally stored fields (off-page columns) +of a clustered index entry. +@param[in] entry index entry to insert +@param[in] big_rec externally stored fields +@param[in,out] offsets rec_get_offsets() +@param[in,out] heap memory heap +@param[in] thd client connection, or NULL +@param[in] index clustered index +@return error code +@retval DB_SUCCESS +@retval DB_OUT_OF_FILE_SPACE */ +static +dberr_t +row_ins_index_entry_big_rec( + const dtuple_t* entry, + const big_rec_t* big_rec, + rec_offs* offsets, + mem_heap_t** heap, + dict_index_t* index, + const void* thd __attribute__((unused))) +{ + mtr_t mtr; + btr_pcur_t pcur; + rec_t* rec; + + pcur.btr_cur.page_cur.index = index; + ut_ad(index->is_primary()); + + DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern_latch"); + + mtr.start(); + if (index->table->is_temporary()) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + index->set_modified(mtr); + } + + dberr_t error = btr_pcur_open(entry, PAGE_CUR_LE, BTR_MODIFY_TREE, + &pcur, &mtr); + if (error != DB_SUCCESS) { + return error; + } + + rec = btr_pcur_get_rec(&pcur); + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, heap); + + DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern"); + error = btr_store_big_rec_extern_fields( + &pcur, offsets, big_rec, &mtr, BTR_STORE_INSERT); + DEBUG_SYNC_C_IF_THD(thd, "after_row_ins_extern"); + + mtr.commit(); + + ut_free(pcur.old_rec_buf); + return(error); +} + +#ifdef HAVE_REPLICATION /* Working around MDEV-24622 */ +extern "C" int thd_is_slave(const MYSQL_THD thd); +#else +# define thd_is_slave(thd) 0 +#endif + +#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__ +/* Avoid GCC 4.8.5 internal compiler error due to srw_mutex::wr_unlock(). +We would only need this for row_ins_clust_index_entry_low(), +but GCC 4.8.5 does not support pop_options. */ +# pragma GCC optimize ("O0") +#endif + +/***************************************************************//** +Tries to insert an entry into a clustered index, ignoring foreign key +constraints. If a record with the same unique key is found, the other +record is necessarily marked deleted by a committed transaction, or a +unique key violation error occurs. The delete marked record is then +updated to an existing record, and we must write an undo log record on +the delete marked record. 
+@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed +@return error code */ +dberr_t +row_ins_clust_index_entry_low( +/*==========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: clustered index */ + ulint n_uniq, /*!< in: 0 or index->n_uniq */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr) /*!< in: query thread */ +{ + btr_pcur_t pcur; + dberr_t err = DB_SUCCESS; + big_rec_t* big_rec = NULL; + mtr_t mtr; + uint64_t auto_inc = 0; + mem_heap_t* offsets_heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + trx_t* trx = thr_get_trx(thr); + buf_block_t* block; + + DBUG_ENTER("row_ins_clust_index_entry_low"); + + ut_ad(dict_index_is_clust(index)); + ut_ad(!dict_index_is_unique(index) + || n_uniq == dict_index_get_n_unique(index)); + ut_ad(!n_uniq || n_uniq == dict_index_get_n_unique(index)); + ut_ad(!trx->in_rollback); + + mtr.start(); + + if (index->table->is_temporary()) { + /* Disable REDO logging as the lifetime of temp-tables is + limited to server or connection lifetime and so REDO + information is not needed on restart for recovery. + Disable locking as temp-tables are local to a connection. */ + + ut_ad(flags & BTR_NO_LOCKING_FLAG); + ut_ad(!dict_index_is_online_ddl(index)); + ut_ad(!index->table->persistent_autoinc); + ut_ad(!index->is_instant()); + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + index->set_modified(mtr); + + if (UNIV_UNLIKELY(entry->is_metadata())) { + ut_ad(index->is_instant()); + ut_ad(!dict_index_is_online_ddl(index)); + ut_ad(mode == BTR_MODIFY_TREE); + } else { + if (mode == BTR_MODIFY_LEAF + && dict_index_is_online_ddl(index)) { + mode = BTR_MODIFY_LEAF_ALREADY_LATCHED; + mtr_s_lock_index(index, &mtr); + } + + if (unsigned ai = index->table->persistent_autoinc) { + /* Prepare to persist the AUTO_INCREMENT value + from the index entry to PAGE_ROOT_AUTO_INC. 
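+			The value is parsed with row_parse_int() below;
+			if it is nonzero and the descent is not already
+			BTR_MODIFY_TREE, the latch mode is upgraded so
+			that the root page is latched as well, and the
+			value is written to the root page via
+			page_set_autoinc() after the search.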
*/ + const dfield_t* dfield = dtuple_get_nth_field( + entry, ai - 1); + if (!dfield_is_null(dfield)) { + auto_inc = row_parse_int( + static_cast<const byte*>( + dfield->data), + dfield->len, + dfield->type.mtype, + dfield->type.prtype + & DATA_UNSIGNED); + if (auto_inc + && mode != BTR_MODIFY_TREE) { + mode = btr_latch_mode( + BTR_MODIFY_ROOT_AND_LEAF + ^ BTR_MODIFY_LEAF + ^ mode); + } + } + } + } + } + + /* Note that we use PAGE_CUR_LE as the search mode, because then + the function will return in both low_match and up_match of the + cursor sensible values */ + pcur.btr_cur.page_cur.index = index; + err = btr_pcur_open(entry, PAGE_CUR_LE, mode, &pcur, &mtr); + if (err != DB_SUCCESS) { + index->table->file_unreadable = true; +err_exit: + mtr.commit(); + goto func_exit; + } + + if (auto_inc) { + buf_block_t* root + = mtr.at_savepoint(mode != BTR_MODIFY_ROOT_AND_LEAF); + ut_ad(index->page == root->page.id().page_no()); + page_set_autoinc(root, auto_inc, &mtr, false); + } + + btr_pcur_get_btr_cur(&pcur)->thr = thr; + +#ifdef UNIV_DEBUG + { + page_t* page = btr_pcur_get_page(&pcur); + rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); + + ut_ad(page_rec_is_supremum(first_rec) + || rec_n_fields_is_sane(index, first_rec, entry)); + } +#endif /* UNIV_DEBUG */ + + block = btr_pcur_get_block(&pcur); + + DBUG_EXECUTE_IF("row_ins_row_level", goto skip_bulk_insert;); + + if (!(flags & BTR_NO_UNDO_LOG_FLAG) + && page_is_empty(block->page.frame) + && !entry->is_metadata() && !trx->duplicates + && !trx->check_unique_secondary && !trx->check_foreigns + && !trx->dict_operation + && block->page.id().page_no() == index->page + && !index->table->skip_alter_undo + && !index->table->n_rec_locks + && !index->table->is_active_ddl() + && !index->table->has_spatial_index() + && !index->table->versioned() + && !thd_is_slave(trx->mysql_thd) /* FIXME: MDEV-24622 */) { + DEBUG_SYNC_C("empty_root_page_insert"); + + trx->bulk_insert = true; + + if (!index->table->is_temporary()) { + err = lock_table(index->table, NULL, LOCK_X, thr); + + if (err != DB_SUCCESS) { + trx->error_state = err; + trx->bulk_insert = false; + goto err_exit; + } + + if (index->table->n_rec_locks) { +avoid_bulk: + trx->bulk_insert = false; + goto skip_bulk_insert; + } + +#ifdef WITH_WSREP + if (trx->is_wsrep()) + { + if (!wsrep_thd_is_local_transaction(trx->mysql_thd)) + goto skip_bulk_insert; + if (wsrep_append_table_key(trx->mysql_thd, *index->table)) + { + trx->error_state = DB_ROLLBACK; + goto err_exit; + } + } +#endif /* WITH_WSREP */ + +#ifdef BTR_CUR_HASH_ADAPT + if (btr_search_enabled) { + btr_search_x_lock_all(); + index->table->bulk_trx_id = trx->id; + btr_search_x_unlock_all(); + } else { + index->table->bulk_trx_id = trx->id; + } +#else /* BTR_CUR_HASH_ADAPT */ + index->table->bulk_trx_id = trx->id; +#endif /* BTR_CUR_HASH_ADAPT */ + + /* Write TRX_UNDO_EMPTY undo log and + start buffering the insert operation */ + err = trx_undo_report_row_operation( + thr, index, entry, + nullptr, 0, nullptr, nullptr, + nullptr); + + if (err != DB_SUCCESS) { + goto avoid_bulk; + } + + goto err_exit; + } + } + +skip_bulk_insert: + if (UNIV_UNLIKELY(entry->info_bits != 0)) { + ut_ad(entry->is_metadata()); + ut_ad(flags == BTR_NO_LOCKING_FLAG); + ut_ad(index->is_instant()); + ut_ad(!dict_index_is_online_ddl(index)); + + const rec_t* rec = btr_pcur_get_rec(&pcur); + + if (rec_get_info_bits(rec, page_rec_is_comp(rec)) + & REC_INFO_MIN_REC_FLAG) { + trx->error_info = index; + err = DB_DUPLICATE_KEY; + goto err_exit; + } + + 
ut_ad(!row_ins_must_modify_rec(&pcur.btr_cur)); + goto do_insert; + } + + if (rec_is_metadata(btr_pcur_get_rec(&pcur), *index)) { + goto do_insert; + } + + if (n_uniq + && (pcur.btr_cur.up_match >= n_uniq + || pcur.btr_cur.low_match >= n_uniq)) { + + if (flags + == (BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG)) { + /* Set no locks when applying log + in online table rebuild. Only check for duplicates. */ + err = row_ins_duplicate_error_in_clust_online( + n_uniq, entry, &pcur.btr_cur, + &offsets, &offsets_heap); + + switch (err) { + case DB_SUCCESS: + break; + default: + ut_ad(0); + /* fall through */ + case DB_SUCCESS_LOCKED_REC: + case DB_DUPLICATE_KEY: + trx->error_info = index; + } + } else { + /* Note that the following may return also + DB_LOCK_WAIT */ + + err = row_ins_duplicate_error_in_clust( + flags, &pcur.btr_cur, entry, thr); + } + + if (err != DB_SUCCESS) { + goto err_exit; + } + } + + /* Note: Allowing duplicates would qualify for modification of + an existing record as the new entry is exactly same as old entry. */ + if (row_ins_must_modify_rec(&pcur.btr_cur)) { + /* There is already an index entry with a long enough common + prefix, we must convert the insert into a modify of an + existing record */ + mem_heap_t* entry_heap = mem_heap_create(1024); + + err = row_ins_clust_index_entry_by_modify( + &pcur, flags, mode, &offsets, &offsets_heap, + entry_heap, entry, thr, &mtr); + + mtr_commit(&mtr); + mem_heap_free(entry_heap); + } else { + if (index->is_instant()) entry->trim(*index); +do_insert: + rec_t* insert_rec; + + if (mode != BTR_MODIFY_TREE) { + ut_ad(mode == BTR_MODIFY_LEAF + || mode == BTR_MODIFY_LEAF_ALREADY_LATCHED + || mode == BTR_MODIFY_ROOT_AND_LEAF + || mode + == BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED); + err = btr_cur_optimistic_insert( + flags, &pcur.btr_cur, &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + } else { + if (buf_pool.running_out()) { + err = DB_LOCK_TABLE_FULL; + goto err_exit; + } + + err = btr_cur_optimistic_insert( + flags, &pcur.btr_cur, + &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + + if (err == DB_FAIL) { + err = btr_cur_pessimistic_insert( + flags, &pcur.btr_cur, + &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + } + } + + mtr.commit(); + + if (big_rec) { + /* Online table rebuild could read (and + ignore) the incomplete record at this point. + If online rebuild is in progress, the + row_ins_index_entry_big_rec() will write log. */ + + DBUG_EXECUTE_IF( + "row_ins_extern_checkpoint", + log_write_up_to(mtr.commit_lsn(), true);); + err = row_ins_index_entry_big_rec( + entry, big_rec, offsets, &offsets_heap, index, + trx->mysql_thd); + dtuple_convert_back_big_rec(index, entry, big_rec); + } + } + +func_exit: + if (offsets_heap != NULL) { + mem_heap_free(offsets_heap); + } + + ut_free(pcur.old_rec_buf); + DBUG_RETURN(err); +} + +/** Start a mini-transaction. +@param[in,out] mtr mini-transaction +@param[in,out] index secondary index */ +static void row_ins_sec_mtr_start(mtr_t *mtr, dict_index_t *index) +{ + ut_ad(!dict_index_is_clust(index)); + ut_ad(mtr->is_named_space(index->table->space)); + + const mtr_log_t log_mode = mtr->get_log_mode(); + + mtr->start(); + index->set_modified(*mtr); + mtr->set_log_mode(log_mode); +} + +/***************************************************************//** +Tries to insert an entry into a secondary index. 
If a record with exactly the +same fields is found, the other record is necessarily marked deleted. +It is then unmarked. Otherwise, the entry is just inserted to the index. +@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_INSERT_TREE is needed +@return error code */ +dberr_t +row_ins_sec_index_entry_low( +/*========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_INSERT_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: secondary index */ + mem_heap_t* offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + trx_id_t trx_id, /*!< in: PAGE_MAX_TRX_ID during + row_log_table_apply(), or 0 */ + que_thr_t* thr) /*!< in: query thread */ +{ + DBUG_ENTER("row_ins_sec_index_entry_low"); + + btr_cur_t cursor; + btr_latch_mode search_mode = mode; + dberr_t err; + ulint n_unique; + mtr_t mtr; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + rtr_info_t rtr_info; + + ut_ad(!dict_index_is_clust(index)); + ut_ad(mode == BTR_MODIFY_LEAF || mode == BTR_INSERT_TREE); + + cursor.thr = thr; + cursor.rtr_info = NULL; + cursor.page_cur.index = index; + ut_ad(thr_get_trx(thr)->id != 0); + + mtr.start(); + + if (index->table->is_temporary()) { + /* Disable locking, because temporary tables are never + shared between transactions or connections. */ + ut_ad(flags & BTR_NO_LOCKING_FLAG); + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + index->set_modified(mtr); + } + + /* Note that we use PAGE_CUR_LE as the search mode, because then + the function will return in both low_match and up_match of the + cursor sensible values */ + + if (index->is_spatial()) { + rtr_init_rtr_info(&rtr_info, false, &cursor, index, false); + rtr_info_update_btr(&cursor, &rtr_info); + + err = rtr_insert_leaf(&cursor, entry, search_mode, &mtr); + + if (err == DB_SUCCESS && search_mode == BTR_MODIFY_LEAF + && rtr_info.mbr_adj) { + mtr_commit(&mtr); + search_mode = mode = BTR_MODIFY_TREE; + rtr_clean_rtr_info(&rtr_info, true); + rtr_init_rtr_info(&rtr_info, false, &cursor, + index, false); + rtr_info_update_btr(&cursor, &rtr_info); + mtr.start(); + if (index->table->is_temporary()) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + index->set_modified(mtr); + } + err = rtr_insert_leaf(&cursor, entry, + search_mode, &mtr); + } + + DBUG_EXECUTE_IF( + "rtree_test_check_count", { + goto func_exit;}); + + } else { + if (!index->table->is_temporary()) { + search_mode = btr_latch_mode( + search_mode + | (thr_get_trx(thr)->check_unique_secondary + ? 
BTR_INSERT + : BTR_INSERT | BTR_IGNORE_SEC_UNIQUE)); + } + + err = cursor.search_leaf(entry, PAGE_CUR_LE, search_mode, + &mtr); + } + + if (err != DB_SUCCESS) { + if (err == DB_DECRYPTION_FAILED) { + btr_decryption_failed(*index); + } + goto func_exit; + } + + if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) { + ut_ad(!dict_index_is_spatial(index)); + /* The insert was buffered during the search: we are done */ + goto func_exit; + } + +#ifdef UNIV_DEBUG + { + page_t* page = btr_cur_get_page(&cursor); + rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); + + ut_ad(page_rec_is_supremum(first_rec) + || rec_n_fields_is_sane(index, first_rec, entry)); + } +#endif /* UNIV_DEBUG */ + + n_unique = dict_index_get_n_unique(index); + + if (dict_index_is_unique(index) + && (cursor.low_match >= n_unique || cursor.up_match >= n_unique)) { + mtr_commit(&mtr); + + DEBUG_SYNC_C("row_ins_sec_index_unique"); + + row_ins_sec_mtr_start(&mtr, index); + + err = row_ins_scan_sec_index_for_duplicate( + flags, index, entry, thr, &mtr, offsets_heap); + + mtr_commit(&mtr); + + switch (err) { + case DB_SUCCESS: + break; + case DB_DUPLICATE_KEY: + if (!index->is_committed()) { + ut_ad(!thr_get_trx(thr) + ->dict_operation_lock_mode); + index->type |= DICT_CORRUPT; + /* Do not return any error to the + caller. The duplicate will be reported + by ALTER TABLE or CREATE UNIQUE INDEX. + Unfortunately we cannot report the + duplicate key value to the DDL thread, + because the altered_table object is + private to its call stack. */ + err = DB_SUCCESS; + } + /* fall through */ + default: + if (dict_index_is_spatial(index)) { + rtr_clean_rtr_info(&rtr_info, true); + } + DBUG_RETURN(err); + } + + row_ins_sec_mtr_start(&mtr, index); + + DEBUG_SYNC_C("row_ins_sec_index_entry_dup_locks_created"); + + /* We did not find a duplicate and we have now + locked with s-locks the necessary records to + prevent any insertion of a duplicate by another + transaction. Let us now reposition the cursor and + continue the insertion (bypassing the change buffer). 
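+		The BTR_INSERT and BTR_IGNORE_SEC_UNIQUE hints are masked
+		out of search_mode below, so this time the record is
+		located on the B-tree leaf page itself rather than being
+		buffered in the change buffer.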
*/ + err = cursor.search_leaf( + entry, PAGE_CUR_LE, + btr_latch_mode(search_mode + & ~(BTR_INSERT + | BTR_IGNORE_SEC_UNIQUE)), + &mtr); + if (err != DB_SUCCESS) { + goto func_exit; + } + } + + if (row_ins_must_modify_rec(&cursor)) { + /* There is already an index entry with a long enough common + prefix, we must convert the insert into a modify of an + existing record */ + offsets = rec_get_offsets( + btr_cur_get_rec(&cursor), index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &offsets_heap); + + err = row_ins_sec_index_entry_by_modify( + flags, mode, &cursor, &offsets, + offsets_heap, heap, entry, thr, &mtr); + + if (err == DB_SUCCESS && dict_index_is_spatial(index) + && rtr_info.mbr_adj) { + err = rtr_ins_enlarge_mbr(&cursor, &mtr); + } + } else { + rec_t* insert_rec; + big_rec_t* big_rec; + + if (mode == BTR_MODIFY_LEAF) { + err = btr_cur_optimistic_insert( + flags, &cursor, &offsets, &offsets_heap, + entry, &insert_rec, + &big_rec, 0, thr, &mtr); + if (err == DB_SUCCESS + && dict_index_is_spatial(index) + && rtr_info.mbr_adj) { + err = rtr_ins_enlarge_mbr(&cursor, &mtr); + } + } else { + if (buf_pool.running_out()) { + err = DB_LOCK_TABLE_FULL; + goto func_exit; + } + + err = btr_cur_optimistic_insert( + flags, &cursor, + &offsets, &offsets_heap, + entry, &insert_rec, + &big_rec, 0, thr, &mtr); + if (err == DB_FAIL) { + err = btr_cur_pessimistic_insert( + flags, &cursor, + &offsets, &offsets_heap, + entry, &insert_rec, + &big_rec, 0, thr, &mtr); + } + if (err == DB_SUCCESS + && dict_index_is_spatial(index) + && rtr_info.mbr_adj) { + err = rtr_ins_enlarge_mbr(&cursor, &mtr); + } + } + + if (err == DB_SUCCESS && trx_id) { + page_update_max_trx_id( + btr_cur_get_block(&cursor), + btr_cur_get_page_zip(&cursor), + trx_id, &mtr); + } + + ut_ad(!big_rec); + } + +func_exit: + if (dict_index_is_spatial(index)) { + rtr_clean_rtr_info(&rtr_info, true); + } + + mtr_commit(&mtr); + DBUG_RETURN(err); +} + +/***************************************************************//** +Inserts an entry into a clustered index. Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +dberr_t +row_ins_clust_index_entry( +/*======================*/ + dict_index_t* index, /*!< in: clustered index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + dberr_t err; + ulint n_uniq; + + DBUG_ENTER("row_ins_clust_index_entry"); + + if (!index->table->foreign_set.empty()) { + err = row_ins_check_foreign_constraints( + index->table, index, true, entry, thr); + if (err != DB_SUCCESS) { + + DBUG_RETURN(err); + } + } + + n_uniq = dict_index_is_unique(index) ? index->n_uniq : 0; + +#ifdef WITH_WSREP + const bool skip_locking + = wsrep_thd_skip_locking(thr_get_trx(thr)->mysql_thd); + ulint flags = index->table->no_rollback() ? BTR_NO_ROLLBACK + : (index->table->is_temporary() || skip_locking) + ? 
BTR_NO_LOCKING_FLAG : 0; +#ifdef UNIV_DEBUG + if (skip_locking && strcmp(wsrep_get_sr_table_name(), + index->table->name.m_name)) { + WSREP_ERROR("Record locking is disabled in this thread, " + "but the table being modified is not " + "`%s`: `%s`.", wsrep_get_sr_table_name(), + index->table->name.m_name); + ut_error; + } +#endif /* UNIV_DEBUG */ +#else + ulint flags = index->table->no_rollback() ? BTR_NO_ROLLBACK + : index->table->is_temporary() + ? BTR_NO_LOCKING_FLAG : 0; +#endif /* WITH_WSREP */ + const ulint orig_n_fields = entry->n_fields; + + /* For intermediate table during copy alter table, + skip the undo log and record lock checking for + insertion operation. + */ + if (index->table->skip_alter_undo) { + flags |= BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG; + } + + /* Try first optimistic descent to the B-tree */ + log_free_check(); + + err = row_ins_clust_index_entry_low( + flags, BTR_MODIFY_LEAF, index, n_uniq, entry, + n_ext, thr); + + entry->n_fields = orig_n_fields; + + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "after_row_ins_clust_index_entry_leaf"); + + if (err != DB_FAIL) { + DEBUG_SYNC_C("row_ins_clust_index_entry_leaf_after"); + DBUG_RETURN(err); + } + + /* Try then pessimistic descent to the B-tree */ + log_free_check(); + + err = row_ins_clust_index_entry_low( + flags, BTR_MODIFY_TREE, index, n_uniq, entry, + n_ext, thr); + + entry->n_fields = orig_n_fields; + + DBUG_RETURN(err); +} + +/***************************************************************//** +Inserts an entry into a secondary index. Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +dberr_t +row_ins_sec_index_entry( +/*====================*/ + dict_index_t* index, /*!< in: secondary index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + bool check_foreign) /*!< in: true if check + foreign table is needed, false otherwise */ +{ + dberr_t err = DB_SUCCESS; + mem_heap_t* offsets_heap; + mem_heap_t* heap; + trx_id_t trx_id = 0; + + DBUG_EXECUTE_IF("row_ins_sec_index_entry_timeout", { + DBUG_SET("-d,row_ins_sec_index_entry_timeout"); + return(DB_LOCK_WAIT);}); + + if (check_foreign && !index->table->foreign_set.empty()) { + err = row_ins_check_foreign_constraints(index->table, index, + false, entry, thr); + if (err != DB_SUCCESS) { + + return(err); + } + } + + ut_ad(thr_get_trx(thr)->id != 0); + + offsets_heap = mem_heap_create(1024); + heap = mem_heap_create(1024); + + /* Try first optimistic descent to the B-tree */ + + log_free_check(); + ulint flags = index->table->is_temporary() + ? BTR_NO_LOCKING_FLAG + : 0; + + /* For intermediate table during copy alter table, + skip the undo log and record lock checking for + insertion operation. 
+ */ + if (index->table->skip_alter_undo) { + trx_id = thr_get_trx(thr)->id; + flags |= BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG; + } + + err = row_ins_sec_index_entry_low( + flags, BTR_MODIFY_LEAF, index, offsets_heap, heap, entry, + trx_id, thr); + if (err == DB_FAIL) { + mem_heap_empty(heap); + + if (index->table->space == fil_system.sys_space + && !(index->type & (DICT_UNIQUE | DICT_SPATIAL))) { + ibuf_free_excess_pages(); + } + + /* Try then pessimistic descent to the B-tree */ + log_free_check(); + + err = row_ins_sec_index_entry_low( + flags, BTR_INSERT_TREE, index, + offsets_heap, heap, entry, 0, thr); + } + + mem_heap_free(heap); + mem_heap_free(offsets_heap); + return(err); +} + +/***************************************************************//** +Inserts an index entry to index. Tries first optimistic, then pessimistic +descent down the tree. If the entry matches enough to a delete marked record, +performs the insert by updating or delete unmarking the delete marked +record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +static +dberr_t +row_ins_index_entry( +/*================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr) /*!< in: query thread */ +{ + trx_t* trx = thr_get_trx(thr); + + ut_ad(trx->id || index->table->no_rollback() + || index->table->is_temporary()); + + DBUG_EXECUTE_IF("row_ins_index_entry_timeout", { + DBUG_SET("-d,row_ins_index_entry_timeout"); + return(DB_LOCK_WAIT);}); + + if (index->is_btree()) { + if (auto t= trx->check_bulk_buffer(index->table)) { + /* MDEV-25036 FIXME: check also foreign key + constraints */ + ut_ad(!trx->check_foreigns); + return t->bulk_insert_buffered(*entry, *index, trx); + } + } + + if (index->is_primary()) { + return row_ins_clust_index_entry(index, entry, thr, 0); + } else { + return row_ins_sec_index_entry(index, entry, thr); + } +} + + +/*****************************************************************//** +This function generate MBR (Minimum Bounding Box) for spatial objects +and set it to spatial index field. */ +static +void +row_ins_spatial_index_entry_set_mbr_field( +/*======================================*/ + dfield_t* field, /*!< in/out: mbr field */ + const dfield_t* row_field) /*!< in: row field */ +{ + ulint dlen = 0; + double mbr[SPDIMS * 2]; + + /* This must be a GEOMETRY datatype */ + ut_ad(DATA_GEOMETRY_MTYPE(field->type.mtype)); + + const byte* dptr = static_cast<const byte*>( + dfield_get_data(row_field)); + dlen = dfield_get_len(row_field); + + /* obtain the MBR */ + rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE, + static_cast<uint>(dlen - GEO_DATA_HEADER_SIZE), + SPDIMS, mbr); + + /* Set mbr as index entry data */ + dfield_write_mbr(field, mbr); +} + +/** Sets the values of the dtuple fields in entry from the values of appropriate +columns in row. 
+@param[in] index index handler +@param[out] entry index entry to make +@param[in] row row +@return DB_SUCCESS if the set is successful */ +static +dberr_t +row_ins_index_entry_set_vals( + const dict_index_t* index, + dtuple_t* entry, + const dtuple_t* row) +{ + ulint n_fields; + ulint i; + ulint num_v = dtuple_get_n_v_fields(entry); + + n_fields = dtuple_get_n_fields(entry); + + for (i = 0; i < n_fields + num_v; i++) { + dict_field_t* ind_field = NULL; + dfield_t* field; + const dfield_t* row_field; + ulint len; + dict_col_t* col; + + if (i >= n_fields) { + /* This is virtual field */ + field = dtuple_get_nth_v_field(entry, i - n_fields); + col = &dict_table_get_nth_v_col( + index->table, i - n_fields)->m_col; + } else { + field = dtuple_get_nth_field(entry, i); + ind_field = dict_index_get_nth_field(index, i); + col = ind_field->col; + } + + if (col->is_virtual()) { + const dict_v_col_t* v_col + = reinterpret_cast<const dict_v_col_t*>(col); + ut_ad(dtuple_get_n_fields(row) + == dict_table_get_n_cols(index->table)); + row_field = dtuple_get_nth_v_field(row, v_col->v_pos); + } else if (col->is_dropped()) { + ut_ad(index->is_primary()); + + if (!(col->prtype & DATA_NOT_NULL)) { + field->data = NULL; + field->len = UNIV_SQL_NULL; + field->type.prtype = DATA_BINARY_TYPE; + } else { + ut_ad(ind_field->fixed_len <= col->len); + dfield_set_data(field, field_ref_zero, + ind_field->fixed_len); + field->type.prtype = DATA_NOT_NULL; + } + + field->type.mtype = col->len + ? DATA_FIXBINARY : DATA_BINARY; + continue; + } else { + row_field = dtuple_get_nth_field( + row, ind_field->col->ind); + } + + len = dfield_get_len(row_field); + + /* Check column prefix indexes */ + if (ind_field != NULL && ind_field->prefix_len > 0 + && len != UNIV_SQL_NULL) { + + const dict_col_t* col + = dict_field_get_col(ind_field); + + len = dtype_get_at_most_n_mbchars( + col->prtype, col->mbminlen, col->mbmaxlen, + ind_field->prefix_len, + len, + static_cast<const char*>( + dfield_get_data(row_field))); + + ut_ad(!dfield_is_ext(row_field)); + } + + /* Handle spatial index. For the first field, replace + the data with its MBR (Minimum Bounding Box). */ + if ((i == 0) && dict_index_is_spatial(index)) { + if (!row_field->data + || row_field->len < GEO_DATA_HEADER_SIZE) { + return(DB_CANT_CREATE_GEOMETRY_OBJECT); + } + row_ins_spatial_index_entry_set_mbr_field( + field, row_field); + continue; + } + + dfield_set_data(field, dfield_get_data(row_field), len); + if (dfield_is_ext(row_field)) { + ut_ad(dict_index_is_clust(index)); + dfield_set_ext(field); + } + } + + return(DB_SUCCESS); +} + +/***********************************************************//** +Inserts a single index entry to the table. 
+@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins_index_entry_step( +/*=====================*/ + ins_node_t* node, /*!< in: row insert node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + + DBUG_ENTER("row_ins_index_entry_step"); + + ut_ad(dtuple_check_typed(node->row)); + + err = row_ins_index_entry_set_vals(node->index, *node->entry, + node->row); + + if (err != DB_SUCCESS) { + DBUG_RETURN(err); + } + + ut_ad(dtuple_check_typed(*node->entry)); + + err = row_ins_index_entry(node->index, *node->entry, thr); + + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "after_row_ins_index_entry_step"); + + DBUG_RETURN(err); +} + +/***********************************************************//** +Allocates a row id for row and inits the node->index field. */ +UNIV_INLINE +void +row_ins_alloc_row_id_step( +/*======================*/ + ins_node_t* node) /*!< in: row insert node */ +{ + ut_ad(node->state == INS_NODE_ALLOC_ROW_ID); + if (dict_table_get_first_index(node->table)->is_gen_clust()) + dict_sys_write_row_id(node->sys_buf, dict_sys.get_new_row_id()); +} + +/***********************************************************//** +Gets a row to insert from the values list. */ +UNIV_INLINE +void +row_ins_get_row_from_values( +/*========================*/ + ins_node_t* node) /*!< in: row insert node */ +{ + que_node_t* list_node; + dfield_t* dfield; + dtuple_t* row; + ulint i; + + /* The field values are copied in the buffers of the select node and + it is safe to use them until we fetch from select again: therefore + we can just copy the pointers */ + + row = node->row; + + i = 0; + list_node = node->values_list; + + while (list_node) { + eval_exp(list_node); + + dfield = dtuple_get_nth_field(row, i); + dfield_copy_data(dfield, que_node_get_val(list_node)); + + i++; + list_node = que_node_get_next(list_node); + } +} + +/***********************************************************//** +Gets a row to insert from the select list. */ +UNIV_INLINE +void +row_ins_get_row_from_select( +/*========================*/ + ins_node_t* node) /*!< in: row insert node */ +{ + que_node_t* list_node; + dfield_t* dfield; + dtuple_t* row; + ulint i; + + /* The field values are copied in the buffers of the select node and + it is safe to use them until we fetch from select again: therefore + we can just copy the pointers */ + + row = node->row; + + i = 0; + list_node = node->select->select_list; + + while (list_node) { + dfield = dtuple_get_nth_field(row, i); + dfield_copy_data(dfield, que_node_get_val(list_node)); + + i++; + list_node = que_node_get_next(list_node); + } +} + +/***********************************************************//** +Inserts a row to a table. 
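+Each committed, non-corrupted, non-fulltext index of the table is
+processed in turn by row_ins_index_entry_step(), which builds the
+index entry from node->row and inserts it.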
+@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins( +/*====*/ + ins_node_t* node, /*!< in: row insert node */ + que_thr_t* thr) /*!< in: query thread */ +{ + DBUG_ENTER("row_ins"); + + DBUG_PRINT("row_ins", ("table: %s", node->table->name.m_name)); + + if (node->state == INS_NODE_ALLOC_ROW_ID) { + + row_ins_alloc_row_id_step(node); + + node->index = dict_table_get_first_index(node->table); + ut_ad(node->entry_list.empty() == false); + node->entry = node->entry_list.begin(); + + if (node->ins_type == INS_SEARCHED) { + + row_ins_get_row_from_select(node); + + } else if (node->ins_type == INS_VALUES) { + + row_ins_get_row_from_values(node); + } + + node->state = INS_NODE_INSERT_ENTRIES; + } + + ut_ad(node->state == INS_NODE_INSERT_ENTRIES); + + while (dict_index_t *index = node->index) { + if (index->type & (DICT_FTS | DICT_CORRUPT) + || !index->is_committed()) { + } else if (dberr_t err = row_ins_index_entry_step(node, thr)) { + DBUG_RETURN(err); + } + node->index = dict_table_get_next_index(index); + ++node->entry; + } + + ut_ad(node->entry == node->entry_list.end()); + + node->state = INS_NODE_ALLOC_ROW_ID; + + DBUG_RETURN(DB_SUCCESS); +} + +/***********************************************************//** +Inserts a row to a table. This is a high-level function used in SQL execution +graphs. +@return query thread to run next or NULL */ +que_thr_t* +row_ins_step( +/*=========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + ins_node_t* node; + que_node_t* parent; + sel_node_t* sel_node; + trx_t* trx; + dberr_t err; + + ut_ad(thr); + + DEBUG_SYNC_C("innodb_row_ins_step_enter"); + + trx = thr_get_trx(thr); + + node = static_cast<ins_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_INSERT); + + parent = que_node_get_parent(node); + sel_node = node->select; + + if (thr->prev_node == parent) { + node->state = INS_NODE_SET_IX_LOCK; + } + + /* If this is the first time this node is executed (or when + execution resumes after wait for the table IX lock), set an + IX lock on the table and reset the possible select node. MySQL's + partitioned table code may also call an insert within the same + SQL statement AFTER it has used this table handle to do a search. + This happens, for example, when a row update moves it to another + partition. In that case, we have already set the IX lock on the + table during the search operation, and there is no need to set + it again here. But we must write trx->id to node->sys_buf. */ + + if (node->table->no_rollback()) { + /* No-rollback tables should only be written to by a + single thread at a time, but there can be multiple + concurrent readers. We must hold an open table handle. */ + DBUG_ASSERT(node->table->get_ref_count() > 0); + DBUG_ASSERT(node->ins_type == INS_DIRECT); + /* No-rollback tables can consist only of a single index. */ + DBUG_ASSERT(node->entry_list.size() == 1); + DBUG_ASSERT(UT_LIST_GET_LEN(node->table->indexes) == 1); + /* There should be no possibility for interruption and + restarting here. In theory, we could allow resumption + from the INS_NODE_INSERT_ENTRIES state here. 
*/ + DBUG_ASSERT(node->state == INS_NODE_SET_IX_LOCK); + node->index = dict_table_get_first_index(node->table); + node->entry = node->entry_list.begin(); + node->state = INS_NODE_INSERT_ENTRIES; + goto do_insert; + } + + if (node->state == INS_NODE_SET_IX_LOCK) { + + node->state = INS_NODE_ALLOC_ROW_ID; + + if (node->table->is_temporary()) { + node->trx_id = trx->id; + } + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + if (trx->id == node->trx_id) { + /* No need to do IX-locking */ + + goto same_trx; + } + + err = lock_table(node->table, NULL, LOCK_IX, thr); + + DBUG_EXECUTE_IF("ib_row_ins_ix_lock_wait", + err = DB_LOCK_WAIT;); + + if (err != DB_SUCCESS) { + node->state = INS_NODE_SET_IX_LOCK; + goto error_handling; + } + + node->trx_id = trx->id; +same_trx: + if (node->ins_type == INS_SEARCHED) { + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch a row to insert */ + + thr->run_node = sel_node; + + return(thr); + } + } + + if ((node->ins_type == INS_SEARCHED) + && (sel_node->state != SEL_NODE_FETCH)) { + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to insert */ + thr->run_node = parent; + + return(thr); + } +do_insert: + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + + err = row_ins(node, thr); + +error_handling: + trx->error_state = err; + + if (err != DB_SUCCESS) { + /* err == DB_LOCK_WAIT or SQL error detected */ + return(NULL); + } + + /* DO THE TRIGGER ACTIONS HERE */ + + if (node->ins_type == INS_SEARCHED) { + /* Fetch a row to insert */ + + thr->run_node = sel_node; + } else { + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc new file mode 100644 index 00000000..c4f46304 --- /dev/null +++ b/storage/innobase/row/row0log.cc @@ -0,0 +1,4134 @@ +/***************************************************************************** + +Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0log.cc +Modification log for online index creation and online table rebuild + +Created 2011-05-26 Marko Makela +*******************************************************/ + +#include "row0log.h" +#include "row0row.h" +#include "row0ins.h" +#include "row0upd.h" +#include "row0merge.h" +#include "row0ext.h" +#include "log0crypt.h" +#include "data0data.h" +#include "que0que.h" +#include "srv0mon.h" +#include "handler0alter.h" +#include "ut0stage.h" +#include "trx0rec.h" + +#include <sql_class.h> +#include <algorithm> +#include <map> + +Atomic_counter<ulint> onlineddl_rowlog_rows; +ulint onlineddl_rowlog_pct_used; +ulint onlineddl_pct_progress; + +/** Table row modification operations during online table rebuild. +Delete-marked records are not copied to the rebuilt table. */ +enum row_tab_op { + /** Insert a record */ + ROW_T_INSERT = 0x41, + /** Update a record in place */ + ROW_T_UPDATE, + /** Delete (purge) a record */ + ROW_T_DELETE +}; + +/** Index record modification operations during online index creation */ +enum row_op { + /** Insert a record */ + ROW_OP_INSERT = 0x61, + /** Delete a record */ + ROW_OP_DELETE +}; + +/** Size of the modification log entry header, in bytes */ +#define ROW_LOG_HEADER_SIZE 2/*op, extra_size*/ + +/** Log block for modifications during online ALTER TABLE */ +struct row_log_buf_t { + byte* block; /*!< file block buffer */ + size_t size; /*!< length of block in bytes */ + ut_new_pfx_t block_pfx; /*!< opaque descriptor of "block". Set + by ut_allocator::allocate_large() and fed to + ut_allocator::deallocate_large(). */ + mrec_buf_t buf; /*!< buffer for accessing a record + that spans two blocks */ + ulint blocks; /*!< current position in blocks */ + ulint bytes; /*!< current position within block */ + ulonglong total; /*!< logical position, in bytes from + the start of the row_log_table log; + 0 for row_log_online_op() and + row_log_apply(). */ +}; + +/** @brief Buffer for logging modifications during online index creation + +All modifications to an index that is being created will be logged by +row_log_online_op() to this buffer. + +All modifications to a table that is being rebuilt will be logged by +row_log_table_delete(), row_log_table_update(), row_log_table_insert() +to this buffer. + +When head.blocks == tail.blocks, the reader will access tail.block +directly. When also head.bytes == tail.bytes, both counts will be +reset to 0 and the file will be truncated. */ +struct row_log_t { + pfs_os_file_t fd; /*!< file descriptor */ + mysql_mutex_t mutex; /*!< mutex protecting error, + max_trx and tail */ + dict_table_t* table; /*!< table that is being rebuilt, + or NULL when this is a secondary + index that is being created online */ + bool same_pk;/*!< whether the definition of the PRIMARY KEY + has remained the same */ + const dtuple_t* defaults; + /*!< default values of added, changed columns, + or NULL */ + const ulint* col_map;/*!< mapping of old column numbers to + new ones, or NULL if !table */ + dberr_t error; /*!< error that occurred during online + table rebuild */ + /** The transaction ID of the ALTER TABLE transaction. 
Any + concurrent DML would necessarily be logged with a larger + transaction ID, because ha_innobase::prepare_inplace_alter_table() + acts as a barrier that ensures that any concurrent transaction + that operates on the table would have been started after + ha_innobase::prepare_inplace_alter_table() returns and before + ha_innobase::commit_inplace_alter_table(commit=true) is invoked. + + Due to the nondeterministic nature of purge and due to the + possibility of upgrading from an earlier version of MariaDB + or MySQL, it is possible that row_log_table_low() would be + fed DB_TRX_ID that precedes than min_trx. We must normalize + such references to reset_trx_id[]. */ + trx_id_t min_trx; + trx_id_t max_trx;/*!< biggest observed trx_id in + row_log_online_op(); + protected by mutex and index->lock S-latch, + or by index->lock X-latch only */ + row_log_buf_t tail; /*!< writer context; + protected by mutex and index->lock S-latch, + or by index->lock X-latch only */ + size_t crypt_tail_size; /*!< size of crypt_tail_size*/ + byte* crypt_tail; /*!< writer context; + temporary buffer used in encryption, + decryption or NULL*/ + row_log_buf_t head; /*!< reader context; protected by MDL only; + modifiable by row_log_apply_ops() */ + size_t crypt_head_size; /*!< size of crypt_tail_size*/ + byte* crypt_head; /*!< reader context; + temporary buffer used in encryption, + decryption or NULL */ + const char* path; /*!< where to create temporary file during + log operation */ + /** the number of core fields in the clustered index of the + source table; before row_log_table_apply() completes, the + table could be emptied, so that table->is_instant() no longer holds, + but all log records must be in the "instant" format. */ + unsigned n_core_fields; + /** the default values of non-core fields when the operation started */ + dict_col_t::def_t* non_core_fields; + bool allow_not_null; /*!< Whether the alter ignore is being + used or if the sql mode is non-strict mode; + if not, NULL values will not be converted to + defaults */ + const TABLE* old_table; /*< Use old table in case of error. */ + + uint64_t n_rows; /*< Number of rows read from the table */ + + /** Alter table transaction. It can be used to apply the DML logs + into the table */ + const trx_t* alter_trx; + + /** Determine whether the log should be in the 'instant ADD' format + @param[in] index the clustered index of the source table + @return whether to use the 'instant ADD COLUMN' format */ + bool is_instant(const dict_index_t* index) const + { + ut_ad(table); + ut_ad(n_core_fields <= index->n_fields); + return n_core_fields != index->n_fields; + } + + const byte* instant_field_value(ulint n, ulint* len) const + { + ut_ad(n >= n_core_fields); + const dict_col_t::def_t& d= non_core_fields[n - n_core_fields]; + *len = d.len; + return static_cast<const byte*>(d.data); + } +}; + +/** Create the file or online log if it does not exist. +@param[in,out] log online rebuild log +@return true if success, false if not */ +static MY_ATTRIBUTE((warn_unused_result)) +pfs_os_file_t +row_log_tmpfile( + row_log_t* log) +{ + DBUG_ENTER("row_log_tmpfile"); + if (log->fd == OS_FILE_CLOSED) { + log->fd = row_merge_file_create_low(log->path); + DBUG_EXECUTE_IF("row_log_tmpfile_fail", + if (log->fd != OS_FILE_CLOSED) + row_merge_file_destroy_low(log->fd); + log->fd = OS_FILE_CLOSED;); + if (log->fd != OS_FILE_CLOSED) { + MONITOR_ATOMIC_INC(MONITOR_ALTER_TABLE_LOG_FILES); + } + } + + DBUG_RETURN(log->fd); +} + +/** Allocate the memory for the log buffer. 
+@param[in,out] log_buf Buffer used for log operation +@return TRUE if success, false if not */ +static MY_ATTRIBUTE((warn_unused_result)) +bool +row_log_block_allocate( + row_log_buf_t& log_buf) +{ + DBUG_ENTER("row_log_block_allocate"); + if (log_buf.block == NULL) { + DBUG_EXECUTE_IF( + "simulate_row_log_allocation_failure", + DBUG_RETURN(false); + ); + + log_buf.block = ut_allocator<byte>(mem_key_row_log_buf) + .allocate_large(srv_sort_buf_size, + &log_buf.block_pfx); + + if (log_buf.block == NULL) { + DBUG_RETURN(false); + } + log_buf.size = srv_sort_buf_size; + } + DBUG_RETURN(true); +} + +/** Free the log buffer. +@param[in,out] log_buf Buffer used for log operation */ +static +void +row_log_block_free( + row_log_buf_t& log_buf) +{ + DBUG_ENTER("row_log_block_free"); + if (log_buf.block != NULL) { + ut_allocator<byte>(mem_key_row_log_buf).deallocate_large( + log_buf.block, &log_buf.block_pfx); + log_buf.block = NULL; + } + DBUG_VOID_RETURN; +} + +/** Logs an operation to a secondary index that is (or was) being created. +@param index index, S or X latched +@param tuple index tuple +@param trx_id transaction ID for insert, or 0 for delete +@retval false if row_log_apply() failure happens +or true otherwise */ +bool row_log_online_op(dict_index_t *index, const dtuple_t *tuple, + trx_id_t trx_id) +{ + byte* b; + ulint extra_size; + ulint size; + ulint mrec_size; + ulint avail_size; + row_log_t* log; + bool success= true; + + ut_ad(dtuple_validate(tuple)); + ut_ad(dtuple_get_n_fields(tuple) == dict_index_get_n_fields(index)); + ut_ad(index->lock.have_x() || index->lock.have_s()); + + if (index->is_corrupted()) { + return success; + } + + ut_ad(dict_index_is_online_ddl(index) + || (index->online_log + && index->online_status == ONLINE_INDEX_COMPLETE)); + + /* Compute the size of the record. This differs from + row_merge_buf_encode(), because here we do not encode + extra_size+1 (and reserve 0 as the end-of-chunk marker). */ + + size = rec_get_converted_size_temp<false>( + index, tuple->fields, tuple->n_fields, &extra_size); + ut_ad(size >= extra_size); + ut_ad(size <= sizeof log->tail.buf); + + mrec_size = ROW_LOG_HEADER_SIZE + + (extra_size >= 0x80) + size + + (trx_id ? 
DATA_TRX_ID_LEN : 0); + + log = index->online_log; + mysql_mutex_lock(&log->mutex); + +start_log: + if (trx_id > log->max_trx) { + log->max_trx = trx_id; + } + + if (!row_log_block_allocate(log->tail)) { + log->error = DB_OUT_OF_MEMORY; + goto err_exit; + } + + MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf); + + ut_ad(log->tail.bytes < srv_sort_buf_size); + avail_size = srv_sort_buf_size - log->tail.bytes; + + if (mrec_size > avail_size) { + b = log->tail.buf; + } else { + b = log->tail.block + log->tail.bytes; + } + + if (trx_id != 0) { + *b++ = ROW_OP_INSERT; + trx_write_trx_id(b, trx_id); + b += DATA_TRX_ID_LEN; + } else { + *b++ = ROW_OP_DELETE; + } + + if (extra_size < 0x80) { + *b++ = (byte) extra_size; + } else { + ut_ad(extra_size < 0x8000); + *b++ = (byte) (0x80 | (extra_size >> 8)); + *b++ = (byte) extra_size; + } + + rec_convert_dtuple_to_temp<false>( + b + extra_size, index, tuple->fields, tuple->n_fields); + + b += size; + + if (mrec_size >= avail_size) { + const os_offset_t byte_offset + = (os_offset_t) log->tail.blocks + * srv_sort_buf_size; + byte* buf = log->tail.block; + + if (byte_offset + srv_sort_buf_size >= srv_online_max_size) { + if (index->online_status != ONLINE_INDEX_COMPLETE) + goto write_failed; + /* About to run out of log, InnoDB has to + apply the online log for the completed index */ + index->lock.s_unlock(); + dberr_t error= row_log_apply( + log->alter_trx, index, nullptr, nullptr); + index->lock.s_lock(SRW_LOCK_CALL); + if (error != DB_SUCCESS) { + /* Mark all newly added indexes + as corrupted */ + log->error = error; + success = false; + goto err_exit; + } + + /* Recheck whether the index online log */ + if (!index->online_log) { + goto err_exit; + } + + goto start_log; + } + + if (mrec_size == avail_size) { + ut_ad(b == &buf[srv_sort_buf_size]); + } else { + ut_ad(b == log->tail.buf + mrec_size); + memcpy(buf + log->tail.bytes, + log->tail.buf, avail_size); + } + + MEM_CHECK_DEFINED(buf, srv_sort_buf_size); + + if (row_log_tmpfile(log) == OS_FILE_CLOSED) { + log->error = DB_OUT_OF_MEMORY; + goto err_exit; + } + + /* If encryption is enabled encrypt buffer before writing it + to file system. */ + if (srv_encrypt_log) { + if (!log_tmp_block_encrypt( + buf, srv_sort_buf_size, + log->crypt_tail, byte_offset)) { + log->error = DB_DECRYPTION_FAILED; + goto write_failed; + } + + srv_stats.n_rowlog_blocks_encrypted.inc(); + buf = log->crypt_tail; + } + + log->tail.blocks++; + if (os_file_write( + IORequestWrite, + "(modification log)", + log->fd, + buf, byte_offset, srv_sort_buf_size) + != DB_SUCCESS) { +write_failed: + index->type |= DICT_CORRUPT; + } + + MEM_UNDEFINED(log->tail.block, srv_sort_buf_size); + MEM_UNDEFINED(buf, srv_sort_buf_size); + + memcpy(log->tail.block, log->tail.buf + avail_size, + mrec_size - avail_size); + log->tail.bytes = mrec_size - avail_size; + } else { + log->tail.bytes += mrec_size; + ut_ad(b == log->tail.block + log->tail.bytes); + } + + MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf); +err_exit: + mysql_mutex_unlock(&log->mutex); + return success; +} + +/******************************************************//** +Gets the error status of the online index rebuild log. 
+@return DB_SUCCESS or error code */ +dberr_t +row_log_table_get_error( +/*====================*/ + const dict_index_t* index) /*!< in: clustered index of a table + that is being rebuilt online */ +{ + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_online_ddl(index)); + return(index->online_log->error); +} + +/******************************************************//** +Starts logging an operation to a table that is being rebuilt. +@return pointer to log, or NULL if no logging is necessary */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +byte* +row_log_table_open( +/*===============*/ + row_log_t* log, /*!< in/out: online rebuild log */ + ulint size, /*!< in: size of log record */ + ulint* avail) /*!< out: available size for log record */ +{ + mysql_mutex_lock(&log->mutex); + + MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf); + + if (log->error != DB_SUCCESS) { +err_exit: + mysql_mutex_unlock(&log->mutex); + return(NULL); + } + + if (!row_log_block_allocate(log->tail)) { + log->error = DB_OUT_OF_MEMORY; + goto err_exit; + } + + ut_ad(log->tail.bytes < srv_sort_buf_size); + *avail = srv_sort_buf_size - log->tail.bytes; + + if (size > *avail) { + /* Make sure log->tail.buf is large enough */ + ut_ad(size <= sizeof log->tail.buf); + return(log->tail.buf); + } else { + return(log->tail.block + log->tail.bytes); + } +} + +/******************************************************//** +Stops logging an operation to a table that is being rebuilt. */ +static MY_ATTRIBUTE((nonnull)) +void +row_log_table_close_func( +/*=====================*/ + dict_index_t* index, /*!< in/out: online rebuilt index */ +#ifdef UNIV_DEBUG + const byte* b, /*!< in: end of log record */ +#endif /* UNIV_DEBUG */ + ulint size, /*!< in: size of log record */ + ulint avail) /*!< in: available size for log record */ +{ + row_log_t* log = index->online_log; + + mysql_mutex_assert_owner(&log->mutex); + + if (size >= avail) { + const os_offset_t byte_offset + = (os_offset_t) log->tail.blocks + * srv_sort_buf_size; + byte* buf = log->tail.block; + + if (byte_offset + srv_sort_buf_size >= srv_online_max_size) { + goto write_failed; + } + + if (size == avail) { + ut_ad(b == &buf[srv_sort_buf_size]); + } else { + ut_ad(b == log->tail.buf + size); + memcpy(buf + log->tail.bytes, log->tail.buf, avail); + } + + MEM_CHECK_DEFINED(buf, srv_sort_buf_size); + + if (row_log_tmpfile(log) == OS_FILE_CLOSED) { + log->error = DB_OUT_OF_MEMORY; + goto err_exit; + } + + /* If encryption is enabled encrypt buffer before writing it + to file system. 
*/ + if (srv_encrypt_log) { + if (!log_tmp_block_encrypt( + log->tail.block, srv_sort_buf_size, + log->crypt_tail, byte_offset, + index->table->space_id)) { + log->error = DB_DECRYPTION_FAILED; + goto err_exit; + } + + srv_stats.n_rowlog_blocks_encrypted.inc(); + buf = log->crypt_tail; + } + + log->tail.blocks++; + if (os_file_write( + IORequestWrite, + "(modification log)", + log->fd, + buf, byte_offset, srv_sort_buf_size) + != DB_SUCCESS) { +write_failed: + log->error = DB_ONLINE_LOG_TOO_BIG; + } + + MEM_UNDEFINED(log->tail.block, srv_sort_buf_size); + MEM_UNDEFINED(buf, srv_sort_buf_size); + memcpy(log->tail.block, log->tail.buf + avail, size - avail); + log->tail.bytes = size - avail; + } else { + log->tail.bytes += size; + ut_ad(b == log->tail.block + log->tail.bytes); + } + + log->tail.total += size; + MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf); +err_exit: + mysql_mutex_unlock(&log->mutex); + + onlineddl_rowlog_rows++; + /* 10000 means 100.00%, 4525 means 45.25% */ + onlineddl_rowlog_pct_used = static_cast<ulint>((log->tail.total * 10000) / srv_online_max_size); +} + +#ifdef UNIV_DEBUG +# define row_log_table_close(index, b, size, avail) \ + row_log_table_close_func(index, b, size, avail) +#else /* UNIV_DEBUG */ +# define row_log_table_close(log, b, size, avail) \ + row_log_table_close_func(index, size, avail) +#endif /* UNIV_DEBUG */ + +/** Check whether a virtual column is indexed in the new table being +created during alter table +@param[in] index cluster index +@param[in] v_no virtual column number +@return true if it is indexed, else false */ +bool +row_log_col_is_indexed( + const dict_index_t* index, + ulint v_no) +{ + return(dict_table_get_nth_v_col( + index->online_log->table, v_no)->m_col.ord_part); +} + +/******************************************************//** +Logs a delete operation to a table that is being rebuilt. +This will be merged in row_log_table_apply_delete(). */ +void +row_log_table_delete( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */ + const byte* sys) /*!< in: DB_TRX_ID,DB_ROLL_PTR that should + be logged, or NULL to use those in rec */ +{ + ulint old_pk_extra_size; + ulint old_pk_size; + ulint mrec_size; + ulint avail_size; + mem_heap_t* heap = NULL; + const dtuple_t* old_pk; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index)); + ut_ad(rec_offs_size(offsets) <= sizeof index->online_log->tail.buf); + ut_ad(index->lock.have_any()); + + if (index->online_status != ONLINE_INDEX_CREATION + || (index->type & DICT_CORRUPT) || index->table->corrupted + || index->online_log->error != DB_SUCCESS) { + return; + } + + dict_table_t* new_table = index->online_log->table; + dict_index_t* new_index = dict_table_get_first_index(new_table); + + ut_ad(dict_index_is_clust(new_index)); + ut_ad(!dict_index_is_online_ddl(new_index)); + ut_ad(index->online_log->min_trx); + + /* Create the tuple PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in new_table. */ + if (index->online_log->same_pk) { + dtuple_t* tuple; + ut_ad(new_index->n_uniq == index->n_uniq); + + /* The PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR are in the first + fields of the record. 
*/ + heap = mem_heap_create( + DATA_TRX_ID_LEN + + DTUPLE_EST_ALLOC(new_index->first_user_field())); + old_pk = tuple = dtuple_create(heap, + new_index->first_user_field()); + dict_index_copy_types(tuple, new_index, tuple->n_fields); + dtuple_set_n_fields_cmp(tuple, new_index->n_uniq); + + for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) { + ulint len; + const void* field = rec_get_nth_field( + rec, offsets, i, &len); + dfield_t* dfield = dtuple_get_nth_field( + tuple, i); + ut_ad(len != UNIV_SQL_NULL); + ut_ad(!rec_offs_nth_extern(offsets, i)); + dfield_set_data(dfield, field, len); + } + + dfield_t* db_trx_id = dtuple_get_nth_field( + tuple, new_index->n_uniq); + + const bool replace_sys_fields + = sys + || trx_read_trx_id(static_cast<byte*>(db_trx_id->data)) + < index->online_log->min_trx; + + if (replace_sys_fields) { + if (!sys || trx_read_trx_id(sys) + < index->online_log->min_trx) { + sys = reset_trx_id; + } + + dfield_set_data(db_trx_id, sys, DATA_TRX_ID_LEN); + dfield_set_data(db_trx_id + 1, sys + DATA_TRX_ID_LEN, + DATA_ROLL_PTR_LEN); + } + + ut_d(trx_id_check(db_trx_id->data, + index->online_log->min_trx)); + } else { + /* The PRIMARY KEY has changed. Translate the tuple. */ + old_pk = row_log_table_get_pk( + rec, index, offsets, NULL, &heap); + + if (!old_pk) { + ut_ad(index->online_log->error != DB_SUCCESS); + if (heap) { + goto func_exit; + } + return; + } + } + + ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 2)->len); + ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 1)->len); + old_pk_size = rec_get_converted_size_temp<false>( + new_index, old_pk->fields, old_pk->n_fields, + &old_pk_extra_size); + ut_ad(old_pk_extra_size < 0x100); + + /* 2 = 1 (extra_size) + at least 1 byte payload */ + mrec_size = 2 + old_pk_size; + + if (byte* b = row_log_table_open(index->online_log, + mrec_size, &avail_size)) { + *b++ = ROW_T_DELETE; + *b++ = static_cast<byte>(old_pk_extra_size); + + rec_convert_dtuple_to_temp<false>( + b + old_pk_extra_size, new_index, + old_pk->fields, old_pk->n_fields); + + b += old_pk_size; + + row_log_table_close(index, b, mrec_size, avail_size); + } + +func_exit: + mem_heap_free(heap); +} + +/******************************************************//** +Logs an insert or update to a table that is being rebuilt. 
*/ +static +void +row_log_table_low_redundant( +/*========================*/ + const rec_t* rec, /*!< in: clustered index leaf + page record in ROW_FORMAT=REDUNDANT, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + bool insert, /*!< in: true if insert, + false if update */ + const dtuple_t* old_pk, /*!< in: old PRIMARY KEY value + (if !insert and a PRIMARY KEY + is being created) */ + const dict_index_t* new_index) + /*!< in: clustered index of the + new table, not latched */ +{ + ulint old_pk_size; + ulint old_pk_extra_size; + ulint size; + ulint extra_size; + ulint mrec_size; + ulint avail_size; + mem_heap_t* heap = NULL; + dtuple_t* tuple; + const ulint n_fields = rec_get_n_fields_old(rec); + + ut_ad(index->n_fields >= n_fields); + ut_ad(index->n_fields == n_fields || index->is_instant()); + ut_ad(dict_tf2_is_valid(index->table->flags, index->table->flags2)); + ut_ad(!dict_table_is_comp(index->table)); /* redundant row format */ + ut_ad(dict_index_is_clust(new_index)); + + heap = mem_heap_create(DTUPLE_EST_ALLOC(n_fields)); + tuple = dtuple_create(heap, n_fields); + dict_index_copy_types(tuple, index, n_fields); + + dtuple_set_n_fields_cmp(tuple, dict_index_get_n_unique(index)); + + if (rec_get_1byte_offs_flag(rec)) { + for (ulint i = 0; i < n_fields; i++) { + dfield_t* dfield; + ulint len; + const void* field; + + dfield = dtuple_get_nth_field(tuple, i); + field = rec_get_nth_field_old(rec, i, &len); + + dfield_set_data(dfield, field, len); + } + } else { + for (ulint i = 0; i < n_fields; i++) { + dfield_t* dfield; + ulint len; + const void* field; + + dfield = dtuple_get_nth_field(tuple, i); + field = rec_get_nth_field_old(rec, i, &len); + + dfield_set_data(dfield, field, len); + + if (rec_2_is_field_extern(rec, i)) { + dfield_set_ext(dfield); + } + } + } + + dfield_t* db_trx_id = dtuple_get_nth_field(tuple, index->n_uniq); + ut_ad(dfield_get_len(db_trx_id) == DATA_TRX_ID_LEN); + ut_ad(dfield_get_len(db_trx_id + 1) == DATA_ROLL_PTR_LEN); + + if (trx_read_trx_id(static_cast<const byte*> + (dfield_get_data(db_trx_id))) + < index->online_log->min_trx) { + dfield_set_data(db_trx_id, reset_trx_id, DATA_TRX_ID_LEN); + dfield_set_data(db_trx_id + 1, reset_trx_id + DATA_TRX_ID_LEN, + DATA_ROLL_PTR_LEN); + } + + const bool is_instant = index->online_log->is_instant(index); + rec_comp_status_t status = is_instant + ? 
REC_STATUS_INSTANT : REC_STATUS_ORDINARY; + + size = rec_get_converted_size_temp<true>( + index, tuple->fields, tuple->n_fields, &extra_size, status); + if (is_instant) { + size++; + extra_size++; + } + + mrec_size = ROW_LOG_HEADER_SIZE + size + (extra_size >= 0x80); + + if (insert || index->online_log->same_pk) { + ut_ad(!old_pk); + old_pk_extra_size = old_pk_size = 0; + } else { + ut_ad(old_pk); + ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp); + ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 2)->len); + ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 1)->len); + + old_pk_size = rec_get_converted_size_temp<false>( + new_index, old_pk->fields, old_pk->n_fields, + &old_pk_extra_size); + ut_ad(old_pk_extra_size < 0x100); + mrec_size += 1/*old_pk_extra_size*/ + old_pk_size; + } + + if (byte* b = row_log_table_open(index->online_log, + mrec_size, &avail_size)) { + if (insert) { + *b++ = ROW_T_INSERT; + } else { + *b++ = ROW_T_UPDATE; + + if (old_pk_size) { + *b++ = static_cast<byte>(old_pk_extra_size); + + rec_convert_dtuple_to_temp<false>( + b + old_pk_extra_size, new_index, + old_pk->fields, old_pk->n_fields); + b += old_pk_size; + } + } + + if (extra_size < 0x80) { + *b++ = static_cast<byte>(extra_size); + } else { + ut_ad(extra_size < 0x8000); + *b++ = static_cast<byte>(0x80 | (extra_size >> 8)); + *b++ = static_cast<byte>(extra_size); + } + + if (status == REC_STATUS_INSTANT) { + ut_ad(is_instant); + if (n_fields <= index->online_log->n_core_fields) { + status = REC_STATUS_ORDINARY; + } + *b = status; + } + + rec_convert_dtuple_to_temp<true>( + b + extra_size, index, tuple->fields, tuple->n_fields, + status); + b += size; + + row_log_table_close(index, b, mrec_size, avail_size); + } + + mem_heap_free(heap); +} + +/******************************************************//** +Logs an insert or update to a table that is being rebuilt. 
*/ +static +void +row_log_table_low( +/*==============*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */ + bool insert, /*!< in: true if insert, false if update */ + const dtuple_t* old_pk) /*!< in: old PRIMARY KEY value (if !insert + and a PRIMARY KEY is being created) */ +{ + ulint old_pk_size; + ulint old_pk_extra_size; + ulint extra_size; + ulint mrec_size; + ulint avail_size; + const dict_index_t* new_index; + row_log_t* log = index->online_log; + + new_index = dict_table_get_first_index(log->table); + + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_clust(new_index)); + ut_ad(!dict_index_is_online_ddl(new_index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index)); + ut_ad(rec_offs_size(offsets) <= sizeof log->tail.buf); + ut_ad(index->lock.have_any()); + + /* old_pk=row_log_table_get_pk() [not needed in INSERT] is a prefix + of the clustered index record (PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR), + with no information on virtual columns */ + ut_ad(!old_pk || !insert); + ut_ad(!old_pk || old_pk->n_v_fields == 0); + + if (index->online_status != ONLINE_INDEX_CREATION + || (index->type & DICT_CORRUPT) || index->table->corrupted + || log->error != DB_SUCCESS) { + return; + } + + if (!rec_offs_comp(offsets)) { + row_log_table_low_redundant( + rec, index, insert, old_pk, new_index); + return; + } + + ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY + || rec_get_status(rec) == REC_STATUS_INSTANT); + + const ulint omit_size = REC_N_NEW_EXTRA_BYTES; + + const ulint rec_extra_size = rec_offs_extra_size(offsets) - omit_size; + const bool is_instant = log->is_instant(index); + extra_size = rec_extra_size + is_instant; + + unsigned fake_extra_size = 0; + byte fake_extra_buf[3]; + if (is_instant && UNIV_UNLIKELY(!index->is_instant())) { + /* The source table was emptied after ALTER TABLE + started, and it was converted to non-instant format. + Because row_log_table_apply_op() expects to find + all records to be logged in the same way, we will + be unable to copy the rec_extra_size bytes from the + record header, but must convert them here. 
*/ + unsigned n_add = index->n_fields - 1 - log->n_core_fields; + fake_extra_size = rec_get_n_add_field_len(n_add); + ut_ad(fake_extra_size == 1 || fake_extra_size == 2); + extra_size += fake_extra_size; + byte* fake_extra = fake_extra_buf + fake_extra_size; + rec_set_n_add_field(fake_extra, n_add); + ut_ad(fake_extra == fake_extra_buf); + } + + mrec_size = ROW_LOG_HEADER_SIZE + + (extra_size >= 0x80) + rec_offs_size(offsets) - omit_size + + is_instant + fake_extra_size; + + if (insert || log->same_pk) { + ut_ad(!old_pk); + old_pk_extra_size = old_pk_size = 0; + } else { + ut_ad(old_pk); + ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp); + ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 2)->len); + ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 1)->len); + + old_pk_size = rec_get_converted_size_temp<false>( + new_index, old_pk->fields, old_pk->n_fields, + &old_pk_extra_size); + ut_ad(old_pk_extra_size < 0x100); + mrec_size += 1/*old_pk_extra_size*/ + old_pk_size; + } + + if (byte* b = row_log_table_open(log, mrec_size, &avail_size)) { + if (insert) { + *b++ = ROW_T_INSERT; + } else { + *b++ = ROW_T_UPDATE; + + if (old_pk_size) { + *b++ = static_cast<byte>(old_pk_extra_size); + + rec_convert_dtuple_to_temp<false>( + b + old_pk_extra_size, new_index, + old_pk->fields, old_pk->n_fields); + b += old_pk_size; + } + } + + if (extra_size < 0x80) { + *b++ = static_cast<byte>(extra_size); + } else { + ut_ad(extra_size < 0x8000); + *b++ = static_cast<byte>(0x80 | (extra_size >> 8)); + *b++ = static_cast<byte>(extra_size); + } + + if (is_instant) { + *b++ = fake_extra_size + ? REC_STATUS_INSTANT + : rec_get_status(rec); + } else { + ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY); + } + + memcpy(b, rec - rec_extra_size - omit_size, rec_extra_size); + b += rec_extra_size; + memcpy(b, fake_extra_buf + 1, fake_extra_size); + b += fake_extra_size; + ulint len; + ulint trx_id_offs = rec_get_nth_field_offs( + offsets, index->n_uniq, &len); + ut_ad(len == DATA_TRX_ID_LEN); + memcpy(b, rec, rec_offs_data_size(offsets)); + if (trx_read_trx_id(b + trx_id_offs) < log->min_trx) { + memcpy(b + trx_id_offs, + reset_trx_id, sizeof reset_trx_id); + } + b += rec_offs_data_size(offsets); + + row_log_table_close(index, b, mrec_size, avail_size); + } +} + +/******************************************************//** +Logs an update to a table that is being rebuilt. +This will be merged in row_log_table_apply_update(). */ +void +row_log_table_update( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */ + const dtuple_t* old_pk) /*!< in: row_log_table_get_pk() + before the update */ +{ + row_log_table_low(rec, index, offsets, false, old_pk); +} + +/** Gets the old table column of a PRIMARY KEY column. 
+@param table old table (before ALTER TABLE) +@param col_map mapping of old column numbers to new ones +@param col_no column position in the new table +@return old table column, or NULL if this is an added column */ +static +const dict_col_t* +row_log_table_get_pk_old_col( +/*=========================*/ + const dict_table_t* table, + const ulint* col_map, + ulint col_no) +{ + for (ulint i = 0; i < table->n_cols; i++) { + if (col_no == col_map[i]) { + return(dict_table_get_nth_col(table, i)); + } + } + + return(NULL); +} + +/** Maps an old table column of a PRIMARY KEY column. +@param[in] ifield clustered index field in the new table (after +ALTER TABLE) +@param[in] index the clustered index of ifield +@param[in,out] dfield clustered index tuple field in the new table +@param[in,out] heap memory heap for allocating dfield contents +@param[in] rec clustered index leaf page record in the old +table +@param[in] offsets rec_get_offsets(rec) +@param[in] i rec field corresponding to col +@param[in] zip_size ROW_FORMAT=COMPRESSED size of the old table +@param[in] max_len maximum length of dfield +@param[in] log row log for the table +@retval DB_INVALID_NULL if a NULL value is encountered +@retval DB_TOO_BIG_INDEX_COL if the maximum prefix length is exceeded */ +static +dberr_t +row_log_table_get_pk_col( + const dict_field_t* ifield, + const dict_index_t* index, + dfield_t* dfield, + mem_heap_t* heap, + const rec_t* rec, + const rec_offs* offsets, + ulint i, + ulint zip_size, + ulint max_len, + const row_log_t* log) +{ + const byte* field; + ulint len; + + field = rec_get_nth_field(rec, offsets, i, &len); + + if (len == UNIV_SQL_DEFAULT) { + field = log->instant_field_value(i, &len); + } + + if (len == UNIV_SQL_NULL) { + if (!log->allow_not_null) { + return(DB_INVALID_NULL); + } + + unsigned col_no= ifield->col->ind; + ut_ad(col_no < log->defaults->n_fields); + + field = static_cast<const byte*>( + log->defaults->fields[col_no].data); + if (!field) { + return(DB_INVALID_NULL); + } + len = log->defaults->fields[col_no].len; + } + + if (rec_offs_nth_extern(offsets, i)) { + ulint field_len = ifield->prefix_len; + byte* blob_field; + + if (!field_len) { + field_len = ifield->fixed_len; + if (!field_len) { + field_len = max_len + 1; + } + } + + blob_field = static_cast<byte*>( + mem_heap_alloc(heap, field_len)); + + len = btr_copy_externally_stored_field_prefix( + blob_field, field_len, zip_size, field, len); + if (len >= max_len + 1) { + return(DB_TOO_BIG_INDEX_COL); + } + + dfield_set_data(dfield, blob_field, len); + } else { + dfield_set_data(dfield, mem_heap_dup(heap, field, len), len); + } + + return(DB_SUCCESS); +} + +/******************************************************//** +Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR +of a table that is being rebuilt. 
+@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table, +or NULL if the PRIMARY KEY definition does not change */ +const dtuple_t* +row_log_table_get_pk( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */ + byte* sys, /*!< out: DB_TRX_ID,DB_ROLL_PTR for + row_log_table_delete(), or NULL */ + mem_heap_t** heap) /*!< in/out: memory heap where allocated */ +{ + dtuple_t* tuple = NULL; + row_log_t* log = index->online_log; + + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_online_ddl(index)); + ut_ad(!offsets || rec_offs_validate(rec, index, offsets)); + ut_ad(index->lock.have_any()); + ut_ad(log); + ut_ad(log->table); + ut_ad(log->min_trx); + + if (log->same_pk) { + /* The PRIMARY KEY columns are unchanged. */ + if (sys) { + /* Store the DB_TRX_ID,DB_ROLL_PTR. */ + ulint trx_id_offs = index->trx_id_offset; + + if (!trx_id_offs) { + ulint len; + + if (!offsets) { + offsets = rec_get_offsets( + rec, index, nullptr, + index->n_core_fields, + index->db_trx_id() + 1, heap); + } + + trx_id_offs = rec_get_nth_field_offs( + offsets, index->db_trx_id(), &len); + ut_ad(len == DATA_TRX_ID_LEN); + } + + const byte* ptr = trx_read_trx_id(rec + trx_id_offs) + < log->min_trx + ? reset_trx_id + : rec + trx_id_offs; + + memcpy(sys, ptr, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + ut_d(trx_id_check(sys, log->min_trx)); + } + + return(NULL); + } + + mysql_mutex_lock(&log->mutex); + + /* log->error is protected by log->mutex. */ + if (log->error == DB_SUCCESS) { + dict_table_t* new_table = log->table; + dict_index_t* new_index + = dict_table_get_first_index(new_table); + const ulint new_n_uniq + = dict_index_get_n_unique(new_index); + + if (!*heap) { + ulint size = 0; + + if (!offsets) { + size += (1 + REC_OFFS_HEADER_SIZE + + unsigned(index->n_fields)) + * sizeof *offsets; + } + + for (ulint i = 0; i < new_n_uniq; i++) { + size += dict_col_get_min_size( + dict_index_get_nth_col(new_index, i)); + } + + *heap = mem_heap_create( + DTUPLE_EST_ALLOC(new_n_uniq + 2) + size); + } + + if (!offsets) { + offsets = rec_get_offsets(rec, index, nullptr, + index->n_core_fields, + ULINT_UNDEFINED, heap); + } + + tuple = dtuple_create(*heap, new_n_uniq + 2); + dict_index_copy_types(tuple, new_index, tuple->n_fields); + dtuple_set_n_fields_cmp(tuple, new_n_uniq); + + const ulint max_len = DICT_MAX_FIELD_LEN_BY_FORMAT(new_table); + + const ulint zip_size = index->table->space->zip_size(); + + for (ulint new_i = 0; new_i < new_n_uniq; new_i++) { + dict_field_t* ifield; + dfield_t* dfield; + ulint prtype; + ulint mbminlen, mbmaxlen; + + ifield = dict_index_get_nth_field(new_index, new_i); + dfield = dtuple_get_nth_field(tuple, new_i); + + const ulint col_no + = dict_field_get_col(ifield)->ind; + + if (const dict_col_t* col + = row_log_table_get_pk_old_col( + index->table, log->col_map, col_no)) { + ulint i = dict_col_get_clust_pos(col, index); + + if (i == ULINT_UNDEFINED) { + ut_ad(0); + log->error = DB_CORRUPTION; + goto err_exit; + } + + log->error = row_log_table_get_pk_col( + ifield, new_index, dfield, *heap, + rec, offsets, i, zip_size, max_len, + log); + + if (log->error != DB_SUCCESS) { +err_exit: + tuple = NULL; + goto func_exit; + } + + mbminlen = col->mbminlen; + mbmaxlen = col->mbmaxlen; + prtype = col->prtype; + } else { + /* No matching column was found in the old + table, so this must be an 
added column. + Copy the default value. */ + ut_ad(log->defaults); + + dfield_copy(dfield, dtuple_get_nth_field( + log->defaults, col_no)); + mbminlen = dfield->type.mbminlen; + mbmaxlen = dfield->type.mbmaxlen; + prtype = dfield->type.prtype; + } + + ut_ad(!dfield_is_ext(dfield)); + ut_ad(!dfield_is_null(dfield)); + + if (ifield->prefix_len) { + ulint len = dtype_get_at_most_n_mbchars( + prtype, mbminlen, mbmaxlen, + ifield->prefix_len, + dfield_get_len(dfield), + static_cast<const char*>( + dfield_get_data(dfield))); + + ut_ad(len <= dfield_get_len(dfield)); + dfield_set_len(dfield, len); + } + } + + const byte* trx_roll = rec + + row_get_trx_id_offset(index, offsets); + + /* Copy the fields, because the fields will be updated + or the record may be moved somewhere else in the B-tree + as part of the upcoming operation. */ + if (trx_read_trx_id(trx_roll) < log->min_trx) { + trx_roll = reset_trx_id; + if (sys) { + memcpy(sys, trx_roll, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + } + } else if (sys) { + memcpy(sys, trx_roll, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + trx_roll = sys; + } else { + trx_roll = static_cast<const byte*>( + mem_heap_dup( + *heap, trx_roll, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)); + } + + ut_d(trx_id_check(trx_roll, log->min_trx)); + + dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq), + trx_roll, DATA_TRX_ID_LEN); + dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq + 1), + trx_roll + DATA_TRX_ID_LEN, DATA_ROLL_PTR_LEN); + } + +func_exit: + mysql_mutex_unlock(&log->mutex); + return(tuple); +} + +/******************************************************//** +Logs an insert to a table that is being rebuilt. +This will be merged in row_log_table_apply_insert(). */ +void +row_log_table_insert( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec,index) */ +{ + row_log_table_low(rec, index, offsets, true, NULL); +} + +/******************************************************//** +Converts a log record to a table row. +@return converted row, or NULL if the conversion fails */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +const dtuple_t* +row_log_table_apply_convert_mrec( +/*=============================*/ + const mrec_t* mrec, /*!< in: merge record */ + dict_index_t* index, /*!< in: index of mrec */ + const rec_offs* offsets, /*!< in: offsets of mrec */ + row_log_t* log, /*!< in: rebuild context */ + mem_heap_t* heap, /*!< in/out: memory heap */ + dberr_t* error) /*!< out: DB_SUCCESS or + DB_MISSING_HISTORY or + reason of failure */ +{ + dtuple_t* row; + + log->n_rows++; + *error = DB_SUCCESS; + + /* This is based on row_build(). */ + if (log->defaults) { + row = dtuple_copy(log->defaults, heap); + /* dict_table_copy_types() would set the fields to NULL */ + for (ulint i = 0; i < dict_table_get_n_cols(log->table); i++) { + dict_col_copy_type( + dict_table_get_nth_col(log->table, i), + dfield_get_type(dtuple_get_nth_field(row, i))); + } + } else { + row = dtuple_create(heap, dict_table_get_n_cols(log->table)); + dict_table_copy_types(row, log->table); + } + + for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) { + const dict_field_t* ind_field + = dict_index_get_nth_field(index, i); + + if (ind_field->prefix_len) { + /* Column prefixes can only occur in key + fields, which cannot be stored externally. 
For + a column prefix, there should also be the full + field in the clustered index tuple. The row + tuple comprises full fields, not prefixes. */ + ut_ad(!rec_offs_nth_extern(offsets, i)); + continue; + } + + const dict_col_t* col + = dict_field_get_col(ind_field); + + if (col->is_dropped()) { + /* the column was instantly dropped earlier */ + ut_ad(index->table->instant); + continue; + } + + ulint col_no + = log->col_map[dict_col_get_no(col)]; + + if (col_no == ULINT_UNDEFINED) { + /* the column is being dropped now */ + continue; + } + + dfield_t* dfield + = dtuple_get_nth_field(row, col_no); + + ulint len; + const byte* data; + + if (rec_offs_nth_extern(offsets, i)) { + ut_ad(rec_offs_any_extern(offsets)); + index->lock.x_lock(SRW_LOCK_CALL); + + data = btr_rec_copy_externally_stored_field( + mrec, offsets, + index->table->space->zip_size(), + i, &len, heap); + ut_a(data); + dfield_set_data(dfield, data, len); + + index->lock.x_unlock(); + } else { + data = rec_get_nth_field(mrec, offsets, i, &len); + if (len == UNIV_SQL_DEFAULT) { + data = log->instant_field_value(i, &len); + } + dfield_set_data(dfield, data, len); + } + + if (len != UNIV_SQL_NULL && col->mtype == DATA_MYSQL + && col->len != len && !dict_table_is_comp(log->table)) { + + ut_ad(col->len >= len); + if (dict_table_is_comp(index->table)) { + byte* buf = (byte*) mem_heap_alloc(heap, + col->len); + memcpy(buf, dfield->data, len); + memset(buf + len, 0x20, col->len - len); + + dfield_set_data(dfield, buf, col->len); + } else { + /* field length mismatch should not happen + when rebuilding the redundant row format + table. */ + ut_ad(0); + *error = DB_CORRUPTION; + return(NULL); + } + } + + /* See if any columns were changed to NULL or NOT NULL. */ + const dict_col_t* new_col + = dict_table_get_nth_col(log->table, col_no); + ut_ad(new_col->same_format(*col)); + + /* Assert that prtype matches except for nullability. */ + ut_ad(!((new_col->prtype ^ dfield_get_type(dfield)->prtype) + & ~(DATA_NOT_NULL | DATA_VERSIONED + | CHAR_COLL_MASK << 16 | DATA_LONG_TRUE_VARCHAR))); + + if (new_col->prtype == col->prtype) { + continue; + } + + if ((new_col->prtype & DATA_NOT_NULL) + && dfield_is_null(dfield)) { + + if (!log->allow_not_null) { + /* We got a NULL value for a NOT NULL column. */ + *error = DB_INVALID_NULL; + return NULL; + } + + const dfield_t& default_field + = log->defaults->fields[col_no]; + + Field* field = log->old_table->field[col->ind]; + + field->set_warning(Sql_condition::WARN_LEVEL_WARN, + WARN_DATA_TRUNCATED, 1, + ulong(log->n_rows)); + + *dfield = default_field; + } + + /* Adjust the DATA_NOT_NULL flag in the parsed row. */ + dfield_get_type(dfield)->prtype = new_col->prtype; + + ut_ad(dict_col_type_assert_equal(new_col, + dfield_get_type(dfield))); + } + + return(row); +} + +/******************************************************//** +Replays an insert operation on a table that was rebuilt. 
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_insert_low( +/*===========================*/ + que_thr_t* thr, /*!< in: query graph */ + const dtuple_t* row, /*!< in: table row + in the old table definition */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + row_merge_dup_t* dup) /*!< in/out: for reporting + duplicate key errors */ +{ + dberr_t error; + dtuple_t* entry; + const row_log_t*log = dup->index->online_log; + dict_index_t* index = dict_table_get_first_index(log->table); + ulint n_index = 0; + + ut_ad(dtuple_validate(row)); + + DBUG_LOG("ib_alter_table", + "insert table " << index->table->id << " (index " + << index->id << "): " << rec_printer(row).str()); + + static const ulint flags + = (BTR_CREATE_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG); + + entry = row_build_index_entry(row, NULL, index, heap); + + error = row_ins_clust_index_entry_low( + flags, BTR_MODIFY_TREE, index, index->n_uniq, + entry, 0, thr); + + switch (error) { + case DB_SUCCESS: + break; + case DB_SUCCESS_LOCKED_REC: + /* The row had already been copied to the table. */ + return(DB_SUCCESS); + default: + return(error); + } + + ut_ad(dict_index_is_clust(index)); + + for (n_index += index->type != DICT_CLUSTERED; + (index = dict_table_get_next_index(index)); n_index++) { + if (index->type & DICT_FTS) { + continue; + } + + entry = row_build_index_entry(row, NULL, index, heap); + error = row_ins_sec_index_entry_low( + flags, BTR_INSERT_TREE, + index, offsets_heap, heap, entry, + thr_get_trx(thr)->id, thr); + + if (error != DB_SUCCESS) { + if (error == DB_DUPLICATE_KEY) { + thr_get_trx(thr)->error_key_num = n_index; + } + break; + } + } + + return(error); +} + +/******************************************************//** +Replays an insert operation on a table that was rebuilt. +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_insert( +/*=======================*/ + que_thr_t* thr, /*!< in: query graph */ + const mrec_t* mrec, /*!< in: record to insert */ + const rec_offs* offsets, /*!< in: offsets of mrec */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + row_merge_dup_t* dup) /*!< in/out: for reporting + duplicate key errors */ +{ + row_log_t*log = dup->index->online_log; + dberr_t error; + const dtuple_t* row = row_log_table_apply_convert_mrec( + mrec, dup->index, offsets, log, heap, &error); + + switch (error) { + case DB_SUCCESS: + ut_ad(row != NULL); + break; + default: + ut_ad(0); + /* fall through */ + case DB_INVALID_NULL: + ut_ad(row == NULL); + return(error); + } + + error = row_log_table_apply_insert_low( + thr, row, offsets_heap, heap, dup); + if (error != DB_SUCCESS) { + /* Report the erroneous row using the new + version of the table. */ + innobase_row_to_mysql(dup->table, log->table, row); + } + return(error); +} + +/******************************************************//** +Deletes a record from a table that is being rebuilt. 
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_delete_low( +/*===========================*/ + btr_pcur_t* pcur, /*!< in/out: B-tree cursor, + will be trashed */ + const rec_offs* offsets, /*!< in: offsets on pcur */ + mem_heap_t* heap, /*!< in/out: memory heap */ + mtr_t* mtr) /*!< in/out: mini-transaction, + will be committed */ +{ + dberr_t error; + row_ext_t* ext; + dtuple_t* row; + dict_index_t* index = pcur->index(); + + ut_ad(dict_index_is_clust(index)); + + DBUG_LOG("ib_alter_table", + "delete table " << index->table->id << " (index " + << index->id << "): " + << rec_printer(btr_pcur_get_rec(pcur), offsets).str()); + + if (dict_table_get_next_index(index)) { + /* Build a row template for purging secondary index entries. */ + row = row_build( + ROW_COPY_DATA, index, btr_pcur_get_rec(pcur), + offsets, NULL, NULL, NULL, &ext, heap); + } else { + row = NULL; + } + + btr_cur_pessimistic_delete(&error, FALSE, btr_pcur_get_btr_cur(pcur), + BTR_CREATE_FLAG, false, mtr); + if (error != DB_SUCCESS) { +err_exit: + mtr->commit(); + return error; + } + + mtr->commit(); + + while ((index = dict_table_get_next_index(index)) != NULL) { + if (index->type & DICT_FTS) { + continue; + } + + const dtuple_t* entry = row_build_index_entry( + row, ext, index, heap); + mtr->start(); + index->set_modified(*mtr); + pcur->btr_cur.page_cur.index = index; + error = btr_pcur_open(entry, PAGE_CUR_LE, BTR_PURGE_TREE, pcur, + mtr); + if (error) { + goto err_exit; + } +#ifdef UNIV_DEBUG + switch (btr_pcur_get_btr_cur(pcur)->flag) { + case BTR_CUR_DELETE_REF: + case BTR_CUR_DEL_MARK_IBUF: + case BTR_CUR_DELETE_IBUF: + case BTR_CUR_INSERT_TO_IBUF: + /* We did not request buffering. */ + break; + case BTR_CUR_HASH: + case BTR_CUR_HASH_FAIL: + case BTR_CUR_BINARY: + goto flag_ok; + } + ut_ad(0); +flag_ok: +#endif /* UNIV_DEBUG */ + + if (page_rec_is_infimum(btr_pcur_get_rec(pcur)) + || btr_pcur_get_low_match(pcur) < index->n_uniq) { + /* All secondary index entries should be + found, because new_table is being modified by + this thread only, and all indexes should be + updated in sync. */ + mtr->commit(); + return(DB_INDEX_CORRUPT); + } + + btr_cur_pessimistic_delete(&error, FALSE, + btr_pcur_get_btr_cur(pcur), + BTR_CREATE_FLAG, false, mtr); + mtr->commit(); + } + + return(error); +} + +/******************************************************//** +Replays a delete operation on a table that was rebuilt. +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_delete( +/*=======================*/ + ulint trx_id_col, /*!< in: position of + DB_TRX_ID in the new + clustered index */ + const mrec_t* mrec, /*!< in: merge record */ + const rec_offs* moffsets, /*!< in: offsets of mrec */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + const row_log_t* log) /*!< in: online log */ +{ + dict_table_t* new_table = log->table; + dict_index_t* index = dict_table_get_first_index(new_table); + dtuple_t* old_pk; + mtr_t mtr; + btr_pcur_t pcur; + rec_offs* offsets; + + pcur.btr_cur.page_cur.index = index; + ut_ad(rec_offs_n_fields(moffsets) == index->first_user_field()); + ut_ad(!rec_offs_any_extern(moffsets)); + + /* Convert the row to a search tuple. 
*/ + old_pk = dtuple_create(heap, index->n_uniq); + dict_index_copy_types(old_pk, index, index->n_uniq); + + for (ulint i = 0; i < index->n_uniq; i++) { + ulint len; + const void* field; + field = rec_get_nth_field(mrec, moffsets, i, &len); + ut_ad(len != UNIV_SQL_NULL); + dfield_set_data(dtuple_get_nth_field(old_pk, i), + field, len); + } + + mtr_start(&mtr); + index->set_modified(mtr); + dberr_t err = btr_pcur_open(old_pk, PAGE_CUR_LE, BTR_PURGE_TREE, &pcur, + &mtr); + if (err != DB_SUCCESS) { + goto all_done; + } +#ifdef UNIV_DEBUG + switch (btr_pcur_get_btr_cur(&pcur)->flag) { + case BTR_CUR_DELETE_REF: + case BTR_CUR_DEL_MARK_IBUF: + case BTR_CUR_DELETE_IBUF: + case BTR_CUR_INSERT_TO_IBUF: + /* We did not request buffering. */ + break; + case BTR_CUR_HASH: + case BTR_CUR_HASH_FAIL: + case BTR_CUR_BINARY: + goto flag_ok; + } + ut_ad(0); +flag_ok: +#endif /* UNIV_DEBUG */ + + if (page_rec_is_infimum(btr_pcur_get_rec(&pcur)) + || btr_pcur_get_low_match(&pcur) < index->n_uniq) { +all_done: + mtr_commit(&mtr); + /* The record was not found. All done. */ + /* This should only happen when an earlier + ROW_T_INSERT was skipped or + ROW_T_UPDATE was interpreted as ROW_T_DELETE + due to BLOBs having been freed by rollback. */ + return err; + } + + offsets = rec_get_offsets(btr_pcur_get_rec(&pcur), index, nullptr, + index->n_core_fields, + ULINT_UNDEFINED, &offsets_heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(btr_pcur_get_rec(&pcur), offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + /* Only remove the record if DB_TRX_ID,DB_ROLL_PTR match. */ + + { + ulint len; + const byte* mrec_trx_id + = rec_get_nth_field(mrec, moffsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + const byte* rec_trx_id + = rec_get_nth_field(btr_pcur_get_rec(&pcur), offsets, + trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_d(trx_id_check(rec_trx_id, log->min_trx)); + ut_d(trx_id_check(mrec_trx_id, log->min_trx)); + + ut_ad(rec_get_nth_field(mrec, moffsets, trx_id_col + 1, &len) + == mrec_trx_id + DATA_TRX_ID_LEN); + ut_ad(len == DATA_ROLL_PTR_LEN); + ut_ad(rec_get_nth_field(btr_pcur_get_rec(&pcur), offsets, + trx_id_col + 1, &len) + == rec_trx_id + DATA_TRX_ID_LEN); + ut_ad(len == DATA_ROLL_PTR_LEN); + + if (memcmp(mrec_trx_id, rec_trx_id, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) { + /* The ROW_T_DELETE was logged for a different + PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR. + This is possible if a ROW_T_INSERT was skipped + or a ROW_T_UPDATE was interpreted as ROW_T_DELETE + because some BLOBs were missing due to + (1) rolling back the initial insert, or + (2) purging the BLOB for a later ROW_T_DELETE + (3) purging 'old values' for a later ROW_T_UPDATE + or ROW_T_DELETE. */ + ut_ad(!log->same_pk); + goto all_done; + } + } + + return row_log_table_apply_delete_low(&pcur, offsets, heap, &mtr); +} + +/******************************************************//** +Replays an update operation on a table that was rebuilt. 
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_update( +/*=======================*/ + que_thr_t* thr, /*!< in: query graph */ + ulint new_trx_id_col, /*!< in: position of + DB_TRX_ID in the new + clustered index */ + const mrec_t* mrec, /*!< in: new value */ + const rec_offs* offsets, /*!< in: offsets of mrec */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + row_merge_dup_t* dup, /*!< in/out: for reporting + duplicate key errors */ + const dtuple_t* old_pk) /*!< in: PRIMARY KEY and + DB_TRX_ID,DB_ROLL_PTR + of the old value, + or PRIMARY KEY if same_pk */ +{ + row_log_t* log = dup->index->online_log; + const dtuple_t* row; + dict_index_t* index = dict_table_get_first_index(log->table); + mtr_t mtr; + btr_pcur_t pcur; + dberr_t error; + ulint n_index = 0; + + pcur.btr_cur.page_cur.index = index; + + ut_ad(dtuple_get_n_fields_cmp(old_pk) + == dict_index_get_n_unique(index)); + ut_ad(dtuple_get_n_fields(old_pk) - (log->same_pk ? 0 : 2) + == dict_index_get_n_unique(index)); + + row = row_log_table_apply_convert_mrec( + mrec, dup->index, offsets, log, heap, &error); + + switch (error) { + case DB_SUCCESS: + ut_ad(row != NULL); + break; + default: + ut_ad(0); + /* fall through */ + case DB_INVALID_NULL: + ut_ad(row == NULL); + return(error); + } + + mtr.start(); + index->set_modified(mtr); + error = btr_pcur_open(old_pk, PAGE_CUR_LE, BTR_MODIFY_TREE, &pcur, + &mtr); + if (error != DB_SUCCESS) { +func_exit: + mtr.commit(); +func_exit_committed: + ut_ad(mtr.has_committed()); + ut_free(pcur.old_rec_buf); + + if (error != DB_SUCCESS) { + /* Report the erroneous row using the new + version of the table. */ + innobase_row_to_mysql(dup->table, log->table, row); + } + + return error; + } +#ifdef UNIV_DEBUG + switch (btr_pcur_get_btr_cur(&pcur)->flag) { + case BTR_CUR_DELETE_REF: + case BTR_CUR_DEL_MARK_IBUF: + case BTR_CUR_DELETE_IBUF: + case BTR_CUR_INSERT_TO_IBUF: + ut_ad(0);/* We did not request buffering. */ + case BTR_CUR_HASH: + case BTR_CUR_HASH_FAIL: + case BTR_CUR_BINARY: + break; + } +#endif /* UNIV_DEBUG */ + + ut_ad(!page_rec_is_infimum(btr_pcur_get_rec(&pcur)) + && btr_pcur_get_low_match(&pcur) >= index->n_uniq); + + /* Prepare to update (or delete) the record. 
*/ + rec_offs* cur_offsets = rec_get_offsets( + btr_pcur_get_rec(&pcur), index, nullptr, index->n_core_fields, + ULINT_UNDEFINED, &offsets_heap); + +#ifdef UNIV_DEBUG + if (!log->same_pk) { + ulint len; + const byte* rec_trx_id + = rec_get_nth_field(btr_pcur_get_rec(&pcur), + cur_offsets, index->n_uniq, &len); + const dfield_t* old_pk_trx_id + = dtuple_get_nth_field(old_pk, index->n_uniq); + ut_ad(len == DATA_TRX_ID_LEN); + ut_d(trx_id_check(rec_trx_id, log->min_trx)); + ut_ad(old_pk_trx_id->len == DATA_TRX_ID_LEN); + ut_ad(old_pk_trx_id[1].len == DATA_ROLL_PTR_LEN); + ut_ad(DATA_TRX_ID_LEN + + static_cast<const char*>(old_pk_trx_id->data) + == old_pk_trx_id[1].data); + ut_d(trx_id_check(old_pk_trx_id->data, log->min_trx)); + ut_ad(!memcmp(rec_trx_id, old_pk_trx_id->data, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)); + } +#endif + + dtuple_t* entry = row_build_index_entry_low( + row, NULL, index, heap, ROW_BUILD_NORMAL); + upd_t* update = row_upd_build_difference_binary( + index, entry, btr_pcur_get_rec(&pcur), cur_offsets, + false, false, NULL, heap, dup->table, &error); + if (error != DB_SUCCESS || !update->n_fields) { + goto func_exit; + } + + const bool pk_updated + = upd_get_nth_field(update, 0)->field_no < new_trx_id_col; + + if (pk_updated || rec_offs_any_extern(cur_offsets)) { + /* If the record contains any externally stored + columns, perform the update by delete and insert, + because we will not write any undo log that would + allow purge to free any orphaned externally stored + columns. */ + + if (pk_updated && log->same_pk) { + /* The ROW_T_UPDATE log record should only be + written when the PRIMARY KEY fields of the + record did not change in the old table. We + can only get a change of PRIMARY KEY columns + in the rebuilt table if the PRIMARY KEY was + redefined (!same_pk). */ + ut_ad(0); + error = DB_CORRUPTION; + goto func_exit; + } + + error = row_log_table_apply_delete_low( + &pcur, cur_offsets, heap, &mtr); + ut_ad(mtr.has_committed()); + + if (error == DB_SUCCESS) { + error = row_log_table_apply_insert_low( + thr, row, offsets_heap, heap, dup); + } + + goto func_exit_committed; + } + + dtuple_t* old_row; + row_ext_t* old_ext; + + if (dict_table_get_next_index(index)) { + /* Construct the row corresponding to the old value of + the record. 
*/ + old_row = row_build( + ROW_COPY_DATA, index, btr_pcur_get_rec(&pcur), + cur_offsets, NULL, NULL, NULL, &old_ext, heap); + ut_ad(old_row); + + DBUG_LOG("ib_alter_table", + "update table " << index->table->id + << " (index " << index->id + << ": " << rec_printer(old_row).str() + << " to " << rec_printer(row).str()); + } else { + old_row = NULL; + old_ext = NULL; + } + + big_rec_t* big_rec; + + error = btr_cur_pessimistic_update( + BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG + | BTR_KEEP_POS_FLAG, + btr_pcur_get_btr_cur(&pcur), + &cur_offsets, &offsets_heap, heap, &big_rec, + update, 0, thr, 0, &mtr); + + if (big_rec) { + if (error == DB_SUCCESS) { + error = btr_store_big_rec_extern_fields( + &pcur, cur_offsets, big_rec, &mtr, + BTR_STORE_UPDATE); + } + + dtuple_big_rec_free(big_rec); + } + + for (n_index += index->type != DICT_CLUSTERED; + (index = dict_table_get_next_index(index)); n_index++) { + if (!index->is_btree()) { + continue; + } + + if (error != DB_SUCCESS) { + break; + } + + if (!row_upd_changes_ord_field_binary( + index, update, thr, old_row, NULL)) { + continue; + } + + if (dict_index_has_virtual(index)) { + dtuple_copy_v_fields(old_row, old_pk); + } + + mtr.commit(); + + entry = row_build_index_entry(old_row, old_ext, index, heap); + if (!entry) { + ut_ad(0); + error = DB_CORRUPTION; + goto func_exit_committed; + } + + mtr.start(); + index->set_modified(mtr); + pcur.btr_cur.page_cur.index = index; + + ut_free(pcur.old_rec_buf); + pcur.old_rec_buf = nullptr; + + if (ROW_FOUND != row_search_index_entry( + entry, BTR_MODIFY_TREE, &pcur, &mtr)) { + ut_ad(0); + error = DB_CORRUPTION; + break; + } + + btr_cur_pessimistic_delete( + &error, FALSE, btr_pcur_get_btr_cur(&pcur), + BTR_CREATE_FLAG, false, &mtr); + + if (error != DB_SUCCESS) { + break; + } + + mtr.commit(); + + entry = row_build_index_entry(row, NULL, index, heap); + error = row_ins_sec_index_entry_low( + BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG, + BTR_INSERT_TREE, index, offsets_heap, heap, + entry, thr_get_trx(thr)->id, thr); + + /* Report correct index name for duplicate key error. */ + if (error == DB_DUPLICATE_KEY) { + thr_get_trx(thr)->error_key_num = n_index; + } + + mtr.start(); + index->set_modified(mtr); + } + + goto func_exit; +} + +/******************************************************//** +Applies an operation to a table that was rebuilt. 
+@return NULL on failure (mrec corruption) or when out of data; +pointer to next record on success */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +const mrec_t* +row_log_table_apply_op( +/*===================*/ + que_thr_t* thr, /*!< in: query graph */ + ulint new_trx_id_col, /*!< in: position of + DB_TRX_ID in new index */ + row_merge_dup_t* dup, /*!< in/out: for reporting + duplicate key errors */ + dberr_t* error, /*!< out: DB_SUCCESS + or error code */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + const mrec_t* mrec, /*!< in: merge record */ + const mrec_t* mrec_end, /*!< in: end of buffer */ + rec_offs* offsets) /*!< in/out: work area + for parsing mrec */ +{ + row_log_t* log = dup->index->online_log; + dict_index_t* new_index = dict_table_get_first_index(log->table); + ulint extra_size; + const mrec_t* next_mrec; + dtuple_t* old_pk; + + ut_ad(dict_index_is_clust(dup->index)); + ut_ad(dup->index->table != log->table); + ut_ad(log->head.total <= log->tail.total); + + *error = DB_SUCCESS; + + /* 3 = 1 (op type) + 1 (extra_size) + at least 1 byte payload */ + if (mrec + 3 >= mrec_end) { + return(NULL); + } + + const bool is_instant = log->is_instant(dup->index); + const mrec_t* const mrec_start = mrec; + + switch (*mrec++) { + default: + ut_ad(0); + *error = DB_CORRUPTION; + return(NULL); + case ROW_T_INSERT: + extra_size = *mrec++; + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. */ + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *mrec++; + } + + mrec += extra_size; + + ut_ad(extra_size || !is_instant); + + if (mrec > mrec_end) { + return(NULL); + } + + rec_offs_set_n_fields(offsets, dup->index->n_fields); + rec_init_offsets_temp(mrec, dup->index, offsets, + log->n_core_fields, log->non_core_fields, + is_instant + ? static_cast<rec_comp_status_t>( + *(mrec - extra_size)) + : REC_STATUS_ORDINARY); + + next_mrec = mrec + rec_offs_data_size(offsets); + + if (next_mrec > mrec_end) { + return(NULL); + } else { + log->head.total += ulint(next_mrec - mrec_start); + *error = row_log_table_apply_insert( + thr, mrec, offsets, offsets_heap, + heap, dup); + } + break; + + case ROW_T_DELETE: + extra_size = *mrec++; + ut_ad(mrec < mrec_end); + + /* We assume extra_size < 0x100 for the PRIMARY KEY prefix. + For fixed-length PRIMARY key columns, it is 0. */ + mrec += extra_size; + + /* The ROW_T_DELETE record was converted by + rec_convert_dtuple_to_temp() using new_index. */ + ut_ad(!new_index->is_instant()); + rec_offs_set_n_fields(offsets, new_index->first_user_field()); + rec_init_offsets_temp(mrec, new_index, offsets); + next_mrec = mrec + rec_offs_data_size(offsets); + if (next_mrec > mrec_end) { + return(NULL); + } + + log->head.total += ulint(next_mrec - mrec_start); + + *error = row_log_table_apply_delete( + new_trx_id_col, + mrec, offsets, offsets_heap, heap, log); + break; + + case ROW_T_UPDATE: + /* Logically, the log entry consists of the + (PRIMARY KEY,DB_TRX_ID) of the old value (converted + to the new primary key definition) followed by + the new value in the old table definition. If the + definition of the columns belonging to PRIMARY KEY + is not changed, the log will only contain + DB_TRX_ID,new_row. */ + + if (log->same_pk) { + ut_ad(new_index->n_uniq == dup->index->n_uniq); + + extra_size = *mrec++; + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. 
*/ + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *mrec++; + } + + mrec += extra_size; + + ut_ad(extra_size || !is_instant); + + if (mrec > mrec_end) { + return(NULL); + } + + rec_offs_set_n_fields(offsets, dup->index->n_fields); + rec_init_offsets_temp(mrec, dup->index, offsets, + log->n_core_fields, + log->non_core_fields, + is_instant + ? static_cast<rec_comp_status_t>( + *(mrec - extra_size)) + : REC_STATUS_ORDINARY); + + next_mrec = mrec + rec_offs_data_size(offsets); + + if (next_mrec > mrec_end) { + return(NULL); + } + + old_pk = dtuple_create(heap, new_index->n_uniq); + dict_index_copy_types( + old_pk, new_index, old_pk->n_fields); + + /* Copy the PRIMARY KEY fields from mrec to old_pk. */ + for (ulint i = 0; i < new_index->n_uniq; i++) { + const void* field; + ulint len; + dfield_t* dfield; + + ut_ad(!rec_offs_nth_extern(offsets, i)); + + field = rec_get_nth_field( + mrec, offsets, i, &len); + ut_ad(len != UNIV_SQL_NULL); + + dfield = dtuple_get_nth_field(old_pk, i); + dfield_set_data(dfield, field, len); + } + } else { + /* We assume extra_size < 0x100 + for the PRIMARY KEY prefix. */ + mrec += *mrec + 1; + + if (mrec > mrec_end) { + return(NULL); + } + + /* Get offsets for PRIMARY KEY, + DB_TRX_ID, DB_ROLL_PTR. */ + /* The old_pk prefix was converted by + rec_convert_dtuple_to_temp() using new_index. */ + ut_ad(!new_index->is_instant()); + rec_offs_set_n_fields(offsets, + new_index->first_user_field()); + rec_init_offsets_temp(mrec, new_index, offsets); + + next_mrec = mrec + rec_offs_data_size(offsets); + if (next_mrec + 2 > mrec_end) { + return(NULL); + } + + /* Copy the PRIMARY KEY fields and + DB_TRX_ID, DB_ROLL_PTR from mrec to old_pk. */ + old_pk = dtuple_create(heap, + new_index->first_user_field()); + dict_index_copy_types(old_pk, new_index, + old_pk->n_fields); + + for (ulint i = 0; i < new_index->first_user_field(); + i++) { + const void* field; + ulint len; + dfield_t* dfield; + + ut_ad(!rec_offs_nth_extern(offsets, i)); + + field = rec_get_nth_field( + mrec, offsets, i, &len); + ut_ad(len != UNIV_SQL_NULL); + + dfield = dtuple_get_nth_field(old_pk, i); + dfield_set_data(dfield, field, len); + } + + mrec = next_mrec; + + /* Fetch the new value of the row as it was + in the old table definition. */ + extra_size = *mrec++; + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. */ + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *mrec++; + } + + mrec += extra_size; + + ut_ad(extra_size || !is_instant); + + if (mrec > mrec_end) { + return(NULL); + } + + rec_offs_set_n_fields(offsets, dup->index->n_fields); + rec_init_offsets_temp(mrec, dup->index, offsets, + log->n_core_fields, + log->non_core_fields, + is_instant + ? static_cast<rec_comp_status_t>( + *(mrec - extra_size)) + : REC_STATUS_ORDINARY); + + next_mrec = mrec + rec_offs_data_size(offsets); + + if (next_mrec > mrec_end) { + return(NULL); + } + } + + ut_ad(next_mrec <= mrec_end); + log->head.total += ulint(next_mrec - mrec_start); + dtuple_set_n_fields_cmp(old_pk, new_index->n_uniq); + + *error = row_log_table_apply_update( + thr, new_trx_id_col, + mrec, offsets, offsets_heap, heap, dup, old_pk); + break; + } + + ut_ad(log->head.total <= log->tail.total); + mem_heap_empty(offsets_heap); + mem_heap_empty(heap); + return(next_mrec); +} + +#ifdef HAVE_PSI_STAGE_INTERFACE +/** Estimate how much an ALTER TABLE progress should be incremented per +one block of log applied. +For the other phases of ALTER TABLE we increment the progress with 1 per +page processed. 
+@return amount of abstract units to add to work_completed when one block +of log is applied. +*/ +inline +ulint +row_log_progress_inc_per_block() +{ + /* We must increment the progress once per page (as in + srv_page_size, default = innodb_page_size=16KiB). + One block here is srv_sort_buf_size (usually 1MiB). */ + const ulint pages_per_block = std::max<ulint>( + ulint(srv_sort_buf_size >> srv_page_size_shift), 1); + + /* Multiply by an artificial factor of 6 to even the pace with + the rest of the ALTER TABLE phases, they process page_size amount + of data faster. */ + return(pages_per_block * 6); +} + +/** Estimate how much work is to be done by the log apply phase +of an ALTER TABLE for this index. +@param[in] index index whose log to assess +@return work to be done by log-apply in abstract units +*/ +ulint +row_log_estimate_work( + const dict_index_t* index) +{ + if (index == NULL || index->online_log == NULL + || index->online_log_is_dummy()) { + return(0); + } + + const row_log_t* l = index->online_log; + const ulint bytes_left = + static_cast<ulint>(l->tail.total - l->head.total); + const ulint blocks_left = bytes_left / srv_sort_buf_size; + + return(blocks_left * row_log_progress_inc_per_block()); +} +#else /* HAVE_PSI_STAGE_INTERFACE */ +inline +ulint +row_log_progress_inc_per_block() +{ + return(0); +} +#endif /* HAVE_PSI_STAGE_INTERFACE */ + +/** Applies operations to a table was rebuilt. +@param[in] thr query graph +@param[in,out] dup for reporting duplicate key errors +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. If not NULL, then stage->inc() will be called for each block +of log that is applied. +@return DB_SUCCESS, or error code on failure */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +row_log_table_apply_ops( + que_thr_t* thr, + row_merge_dup_t* dup, + ut_stage_alter_t* stage) +{ + dberr_t error; + const mrec_t* mrec = NULL; + const mrec_t* next_mrec; + const mrec_t* mrec_end = NULL; /* silence bogus warning */ + const mrec_t* next_mrec_end; + mem_heap_t* heap; + mem_heap_t* offsets_heap; + rec_offs* offsets; + bool has_index_lock; + dict_index_t* index = const_cast<dict_index_t*>( + dup->index); + dict_table_t* new_table = index->online_log->table; + dict_index_t* new_index = dict_table_get_first_index( + new_table); + const ulint i = 1 + REC_OFFS_HEADER_SIZE + + std::max<ulint>(index->n_fields, + new_index->first_user_field()); + const ulint new_trx_id_col = dict_col_get_clust_pos( + dict_table_get_sys_col(new_table, DATA_TRX_ID), new_index); + trx_t* trx = thr_get_trx(thr); + + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_online_ddl(index)); + ut_ad(trx->mysql_thd); + ut_ad(index->lock.have_x()); + ut_ad(!dict_index_is_online_ddl(new_index)); + ut_ad(dict_col_get_clust_pos( + dict_table_get_sys_col(index->table, DATA_TRX_ID), index) + != ULINT_UNDEFINED); + ut_ad(new_trx_id_col > 0); + ut_ad(new_trx_id_col != ULINT_UNDEFINED); + + MEM_UNDEFINED(&mrec_end, sizeof mrec_end); + + offsets = static_cast<rec_offs*>(ut_malloc_nokey(i * sizeof *offsets)); + rec_offs_set_n_alloc(offsets, i); + rec_offs_set_n_fields(offsets, dict_index_get_n_fields(index)); + + heap = mem_heap_create(srv_page_size); + offsets_heap = mem_heap_create(srv_page_size); + has_index_lock = true; + +next_block: + ut_ad(has_index_lock); + ut_ad(index->lock.have_u_or_x()); + ut_ad(index->online_log->head.bytes == 0); + + stage->inc(row_log_progress_inc_per_block()); + + if (trx_is_interrupted(trx)) { + goto interrupted; + } + + if 
(index->is_corrupted()) { + error = DB_INDEX_CORRUPT; + goto func_exit; + } + + ut_ad(dict_index_is_online_ddl(index)); + + error = index->online_log->error; + + if (error != DB_SUCCESS) { + goto func_exit; + } + + if (UNIV_UNLIKELY(index->online_log->head.blocks + > index->online_log->tail.blocks)) { +unexpected_eof: + ib::error() << "Unexpected end of temporary file for table " + << index->table->name; +corruption: + error = DB_CORRUPTION; + goto func_exit; + } + + if (index->online_log->head.blocks + == index->online_log->tail.blocks) { + if (index->online_log->head.blocks) { +#ifdef HAVE_FTRUNCATE + /* Truncate the file in order to save space. */ + if (index->online_log->fd > 0 + && ftruncate(index->online_log->fd, 0) == -1) { + ib::error() + << "\'" << index->name + 1 + << "\' failed with error " + << errno << ":" << strerror(errno); + + goto corruption; + } +#endif /* HAVE_FTRUNCATE */ + index->online_log->head.blocks + = index->online_log->tail.blocks = 0; + } + + next_mrec = index->online_log->tail.block; + next_mrec_end = next_mrec + index->online_log->tail.bytes; + + if (next_mrec_end == next_mrec) { + /* End of log reached. */ +all_done: + ut_ad(has_index_lock); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + index->online_log->head.bytes = 0; + index->online_log->tail.bytes = 0; + error = DB_SUCCESS; + goto func_exit; + } + } else { + os_offset_t ofs; + + ofs = (os_offset_t) index->online_log->head.blocks + * srv_sort_buf_size; + + ut_ad(has_index_lock); + has_index_lock = false; + index->lock.x_unlock(); + + log_free_check(); + + ut_ad(dict_index_is_online_ddl(index)); + + if (!row_log_block_allocate(index->online_log->head)) { + error = DB_OUT_OF_MEMORY; + goto func_exit; + } + + byte* buf = index->online_log->head.block; + + if (DB_SUCCESS + != os_file_read(IORequestRead, index->online_log->fd, + buf, ofs, srv_sort_buf_size, nullptr)) { + ib::error() + << "Unable to read temporary file" + " for table " << index->table->name; + goto corruption; + } + + if (srv_encrypt_log) { + if (!log_tmp_block_decrypt( + buf, srv_sort_buf_size, + index->online_log->crypt_head, ofs)) { + error = DB_DECRYPTION_FAILED; + goto func_exit; + } + + srv_stats.n_rowlog_blocks_decrypted.inc(); + memcpy(buf, index->online_log->crypt_head, + srv_sort_buf_size); + } + +#ifdef POSIX_FADV_DONTNEED + /* Each block is read exactly once. Free up the file cache. */ + posix_fadvise(index->online_log->fd, + ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED); +#endif /* POSIX_FADV_DONTNEED */ + + next_mrec = index->online_log->head.block; + next_mrec_end = next_mrec + srv_sort_buf_size; + } + + /* This read is not protected by index->online_log->mutex for + performance reasons. We will eventually notice any error that + was flagged by a DML thread. */ + error = index->online_log->error; + + if (error != DB_SUCCESS) { + goto func_exit; + } + + if (mrec) { + /* A partial record was read from the previous block. + Copy the temporary buffer full, as we do not know the + length of the record. Parse subsequent records from + the bigger buffer index->online_log->head.block + or index->online_log->tail.block. 
*/ + + ut_ad(mrec == index->online_log->head.buf); + ut_ad(mrec_end > mrec); + ut_ad(mrec_end < (&index->online_log->head.buf)[1]); + + memcpy((mrec_t*) mrec_end, next_mrec, + ulint((&index->online_log->head.buf)[1] - mrec_end)); + mrec = row_log_table_apply_op( + thr, new_trx_id_col, + dup, &error, offsets_heap, heap, + index->online_log->head.buf, + (&index->online_log->head.buf)[1], offsets); + if (error != DB_SUCCESS) { + goto func_exit; + } else if (UNIV_UNLIKELY(mrec == NULL)) { + /* The record was not reassembled properly. */ + goto corruption; + } + /* The record was previously found out to be + truncated. Now that the parse buffer was extended, + it should proceed beyond the old end of the buffer. */ + ut_a(mrec > mrec_end); + + index->online_log->head.bytes = ulint(mrec - mrec_end); + next_mrec += index->online_log->head.bytes; + } + + ut_ad(next_mrec <= next_mrec_end); + /* The following loop must not be parsing the temporary + buffer, but head.block or tail.block. */ + + /* mrec!=NULL means that the next record starts from the + middle of the block */ + ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0)); + +#ifdef UNIV_DEBUG + if (next_mrec_end == index->online_log->head.block + + srv_sort_buf_size) { + /* If tail.bytes == 0, next_mrec_end can also be at + the end of tail.block. */ + if (index->online_log->tail.bytes == 0) { + ut_ad(next_mrec == next_mrec_end); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes == 0); + } else { + ut_ad(next_mrec == index->online_log->head.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks + > index->online_log->head.blocks); + } + } else if (next_mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes) { + ut_ad(next_mrec == index->online_log->tail.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes + <= index->online_log->tail.bytes); + } else { + ut_error; + } +#endif /* UNIV_DEBUG */ + + mrec_end = next_mrec_end; + + while (!trx_is_interrupted(trx)) { + mrec = next_mrec; + ut_ad(mrec <= mrec_end); + + if (mrec == mrec_end) { + /* We are at the end of the log. + Mark the replay all_done. */ + if (has_index_lock) { + goto all_done; + } + } + + if (!has_index_lock) { + /* We are applying operations from a different + block than the one that is being written to. + We do not hold index->lock in order to + allow other threads to concurrently buffer + modifications. */ + ut_ad(mrec >= index->online_log->head.block); + ut_ad(mrec_end == index->online_log->head.block + + srv_sort_buf_size); + ut_ad(index->online_log->head.bytes + < srv_sort_buf_size); + + /* Take the opportunity to do a redo log + checkpoint if needed. */ + log_free_check(); + } else { + /* We are applying operations from the last block. + Do not allow other threads to buffer anything, + so that we can finally catch up and synchronize. */ + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(mrec >= index->online_log->tail.block); + } + + /* This read is not protected by index->online_log->mutex + for performance reasons. We will eventually notice any + error that was flagged by a DML thread. 
*/ + error = index->online_log->error; + + if (error != DB_SUCCESS) { + goto func_exit; + } + + next_mrec = row_log_table_apply_op( + thr, new_trx_id_col, + dup, &error, offsets_heap, heap, + mrec, mrec_end, offsets); + + if (error != DB_SUCCESS) { + goto func_exit; + } else if (next_mrec == next_mrec_end) { + /* The record happened to end on a block boundary. + Do we have more blocks left? */ + if (has_index_lock) { + /* The index will be locked while + applying the last block. */ + goto all_done; + } + + mrec = NULL; +process_next_block: + index->lock.x_lock(SRW_LOCK_CALL); + has_index_lock = true; + + index->online_log->head.bytes = 0; + index->online_log->head.blocks++; + goto next_block; + } else if (next_mrec != NULL) { + ut_ad(next_mrec < next_mrec_end); + index->online_log->head.bytes + += ulint(next_mrec - mrec); + } else if (has_index_lock) { + /* When mrec is within tail.block, it should + be a complete record, because we are holding + index->lock and thus excluding the writer. */ + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(0); + goto unexpected_eof; + } else { + memcpy(index->online_log->head.buf, mrec, + ulint(mrec_end - mrec)); + mrec_end += ulint(index->online_log->head.buf - mrec); + mrec = index->online_log->head.buf; + goto process_next_block; + } + } + +interrupted: + error = DB_INTERRUPTED; +func_exit: + if (!has_index_lock) { + index->lock.x_lock(SRW_LOCK_CALL); + } + + mem_heap_free(offsets_heap); + mem_heap_free(heap); + row_log_block_free(index->online_log->head); + ut_free(offsets); + return(error); +} + +/** Apply the row_log_table log to a table upon completing rebuild. +@param[in] thr query graph +@param[in] old_table old table +@param[in,out] table MySQL table (for reporting duplicates) +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. stage->begin_phase_log_table() will be called initially and then +stage->inc() will be called for each block of log that is applied. +@param[in] new_table Altered table +@return DB_SUCCESS, or error code on failure */ +dberr_t +row_log_table_apply( + que_thr_t* thr, + dict_table_t* old_table, + struct TABLE* table, + ut_stage_alter_t* stage, + dict_table_t* new_table) +{ + dberr_t error; + dict_index_t* clust_index; + + thr_get_trx(thr)->error_key_num = 0; + DBUG_EXECUTE_IF("innodb_trx_duplicates", + thr_get_trx(thr)->duplicates = TRX_DUP_REPLACE;); + + stage->begin_phase_log_table(); + + clust_index = dict_table_get_first_index(old_table); + + if (clust_index->online_log->n_rows == 0) { + clust_index->online_log->n_rows = new_table->stat_n_rows; + } + + clust_index->lock.x_lock(SRW_LOCK_CALL); + + if (!clust_index->online_log) { + ut_ad(dict_index_get_online_status(clust_index) + == ONLINE_INDEX_COMPLETE); + /* This function should not be called unless + rebuilding a table online. Build in some fault + tolerance. */ + ut_ad(0); + error = DB_ERROR; + } else { + row_merge_dup_t dup = { + clust_index, table, + clust_index->online_log->col_map, 0 + }; + + error = row_log_table_apply_ops(thr, &dup, stage); + + ut_ad(error != DB_SUCCESS + || clust_index->online_log->head.total + == clust_index->online_log->tail.total); + } + + clust_index->lock.x_unlock(); + DBUG_EXECUTE_IF("innodb_trx_duplicates", + thr_get_trx(thr)->duplicates = 0;); + + return(error); +} + +/******************************************************//** +Allocate the row log for an index and flag the index +for online creation. 
+@retval true if success, false if not */ +bool +row_log_allocate( +/*=============*/ + const trx_t* trx, /*!< in: the ALTER TABLE transaction */ + dict_index_t* index, /*!< in/out: index */ + dict_table_t* table, /*!< in/out: new table being rebuilt, + or NULL when creating a secondary index */ + bool same_pk,/*!< in: whether the definition of the + PRIMARY KEY has remained the same */ + const dtuple_t* defaults, + /*!< in: default values of + added, changed columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL if !table */ + const char* path, /*!< in: where to create temporary file */ + const TABLE* old_table, /*!< in: table definition before alter */ + const bool allow_not_null) /*!< in: allow null to not-null + conversion */ +{ + row_log_t* log; + DBUG_ENTER("row_log_allocate"); + + ut_ad(!dict_index_is_online_ddl(index)); + ut_ad(dict_index_is_clust(index) == !!table); + ut_ad(!table || index->table != table); + ut_ad(same_pk || table); + ut_ad(!table || col_map); + ut_ad(!defaults || col_map); + ut_ad(index->lock.have_u_or_x()); + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + ut_ad(trx->id); + + log = static_cast<row_log_t*>(ut_malloc_nokey(sizeof *log)); + + if (log == NULL) { + DBUG_RETURN(false); + } + + log->fd = OS_FILE_CLOSED; + mysql_mutex_init(index_online_log_key, &log->mutex, nullptr); + + log->table = table; + log->same_pk = same_pk; + log->defaults = defaults; + log->col_map = col_map; + log->error = DB_SUCCESS; + log->min_trx = trx->id; + log->max_trx = 0; + log->tail.blocks = log->tail.bytes = 0; + log->tail.total = 0; + log->tail.block = log->head.block = NULL; + log->crypt_tail = log->crypt_head = NULL; + log->head.blocks = log->head.bytes = 0; + log->head.total = 0; + log->path = path; + log->n_core_fields = index->n_core_fields; + ut_ad(!table || log->is_instant(index) + == (index->n_core_fields < index->n_fields)); + log->allow_not_null = allow_not_null; + log->old_table = old_table; + log->n_rows = 0; + + if (table && index->is_instant()) { + const unsigned n = log->n_core_fields; + log->non_core_fields = UT_NEW_ARRAY_NOKEY( + dict_col_t::def_t, index->n_fields - n); + for (unsigned i = n; i < index->n_fields; i++) { + log->non_core_fields[i - n] + = index->fields[i].col->def_val; + } + } else { + log->non_core_fields = NULL; + } + + dict_index_set_online_status(index, ONLINE_INDEX_CREATION); + + if (srv_encrypt_log) { + log->crypt_head_size = log->crypt_tail_size = srv_sort_buf_size; + log->crypt_head = static_cast<byte *>( + my_large_malloc(&log->crypt_head_size, MYF(MY_WME))); + log->crypt_tail = static_cast<byte *>( + my_large_malloc(&log->crypt_tail_size, MYF(MY_WME))); + + if (!log->crypt_head || !log->crypt_tail) { + row_log_free(log); + DBUG_RETURN(false); + } + } + + index->online_log = log; + + if (!table) { + /* Assign the clustered index online log to table. + It can be used by concurrent DML to identify whether + the table has any online DDL */ + index->table->indexes.start->online_log_make_dummy(); + log->alter_trx = trx; + } + + /* While we might be holding an exclusive data dictionary lock + here, in row_log_abort_sec() we will not always be holding it. Use + atomic operations in both cases. */ + MONITOR_ATOMIC_INC(MONITOR_ONLINE_CREATE_INDEX); + + DBUG_RETURN(true); +} + +/******************************************************//** +Free the row log for an index that was being created online. 
*/ +void +row_log_free( +/*=========*/ + row_log_t* log) /*!< in,own: row log */ +{ + MONITOR_ATOMIC_DEC(MONITOR_ONLINE_CREATE_INDEX); + + UT_DELETE_ARRAY(log->non_core_fields); + row_log_block_free(log->tail); + row_log_block_free(log->head); + row_merge_file_destroy_low(log->fd); + + if (log->crypt_head) { + my_large_free(log->crypt_head, log->crypt_head_size); + } + + if (log->crypt_tail) { + my_large_free(log->crypt_tail, log->crypt_tail_size); + } + + mysql_mutex_destroy(&log->mutex); + ut_free(log); +} + +/******************************************************//** +Get the latest transaction ID that has invoked row_log_online_op() +during online creation. +@return latest transaction ID, or 0 if nothing was logged */ +trx_id_t +row_log_get_max_trx( +/*================*/ + dict_index_t* index) /*!< in: index, must be locked */ +{ + ut_ad(dict_index_get_online_status(index) == ONLINE_INDEX_CREATION); +#ifdef SAFE_MUTEX + ut_ad(index->lock.have_x() + || (index->lock.have_s() + && mysql_mutex_is_owner(&index->online_log->mutex))); +#endif + return(index->online_log->max_trx); +} + +/******************************************************//** +Applies an operation to a secondary index that was being created. */ +static MY_ATTRIBUTE((nonnull)) +void +row_log_apply_op_low( +/*=================*/ + dict_index_t* index, /*!< in/out: index */ + row_merge_dup_t*dup, /*!< in/out: for reporting + duplicate key errors */ + dberr_t* error, /*!< out: DB_SUCCESS or error code */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap for + allocating offsets; can be emptied */ + bool has_index_lock, /*!< in: true if holding index->lock + in exclusive mode */ + enum row_op op, /*!< in: operation being applied */ + trx_id_t trx_id, /*!< in: transaction identifier */ + const dtuple_t* entry) /*!< in: row */ +{ + mtr_t mtr; + btr_cur_t cursor; + rec_offs* offsets = NULL; + + ut_ad(!dict_index_is_clust(index)); + + ut_ad(index->lock.have_x() == has_index_lock); + + ut_ad(!index->is_corrupted()); + ut_ad(trx_id != 0 || op == ROW_OP_DELETE); + + DBUG_LOG("ib_create_index", + (op == ROW_OP_INSERT ? "insert " : "delete ") + << (has_index_lock ? "locked index " : "unlocked index ") + << index->id << ',' << ib::hex(trx_id) << ": " + << rec_printer(entry).str()); + + mtr_start(&mtr); + index->set_modified(mtr); + cursor.page_cur.index = index; + if (has_index_lock) { + mtr_x_lock_index(index, &mtr); + } + + /* We perform the pessimistic variant of the operations if we + already hold index->lock exclusively. First, search the + record. The operation may already have been performed, + depending on when the row in the clustered index was + scanned. */ + *error = cursor.search_leaf(entry, PAGE_CUR_LE, has_index_lock + ? BTR_MODIFY_TREE_ALREADY_LATCHED + : BTR_MODIFY_LEAF, &mtr); + if (UNIV_UNLIKELY(*error != DB_SUCCESS)) { + goto func_exit; + } + + ut_ad(dict_index_get_n_unique(index) > 0); + /* This test is somewhat similar to row_ins_must_modify_rec(), + but not identical for unique secondary indexes. */ + if (cursor.low_match >= dict_index_get_n_unique(index) + && !page_rec_is_infimum(btr_cur_get_rec(&cursor))) { + /* We have a matching record. 
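For a unique secondary index the match may cover only the unique key columns; the 'exists' flag below distinguishes a full-field match (the exact record is present) from a match on the unique prefix alone.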
*/ + bool exists = (cursor.low_match + == dict_index_get_n_fields(index)); +#ifdef UNIV_DEBUG + rec_t* rec = btr_cur_get_rec(&cursor); + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(!rec_get_deleted_flag(rec, page_rec_is_comp(rec))); +#endif /* UNIV_DEBUG */ + + ut_ad(exists || dict_index_is_unique(index)); + + switch (op) { + case ROW_OP_DELETE: + if (!exists) { + /* The existing record matches the + unique secondary index key, but the + PRIMARY KEY columns differ. So, this + exact record does not exist. For + example, we could detect a duplicate + key error in some old index before + logging an ROW_OP_INSERT for our + index. This ROW_OP_DELETE could have + been logged for rolling back + TRX_UNDO_INSERT_REC. */ + goto func_exit; + } + + *error = btr_cur_optimistic_delete( + &cursor, BTR_CREATE_FLAG, &mtr); + + if (*error != DB_FAIL) { + break; + } + + if (!has_index_lock) { + /* This needs a pessimistic operation. + Lock the index tree exclusively. */ + mtr_commit(&mtr); + mtr_start(&mtr); + index->set_modified(mtr); + *error = cursor.search_leaf(entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, + &mtr); + if (UNIV_UNLIKELY(*error != DB_SUCCESS)) { + goto func_exit; + } + /* No other thread than the current one + is allowed to modify the index tree. + Thus, the record should still exist. */ + ut_ad(cursor.low_match + >= dict_index_get_n_fields(index)); + ut_ad(page_rec_is_user_rec( + btr_cur_get_rec(&cursor))); + } + + /* As there are no externally stored fields in + a secondary index record, the parameter + rollback=false will be ignored. */ + + btr_cur_pessimistic_delete( + error, FALSE, &cursor, + BTR_CREATE_FLAG, false, &mtr); + break; + case ROW_OP_INSERT: + if (exists) { + /* The record already exists. There + is nothing to be inserted. + This could happen when processing + TRX_UNDO_DEL_MARK_REC in statement + rollback: + + UPDATE of PRIMARY KEY can lead to + statement rollback if the updated + value of the PRIMARY KEY already + exists. In this case, the UPDATE would + be mapped to DELETE;INSERT, and we + only wrote undo log for the DELETE + part. The duplicate key error would be + triggered before logging the INSERT + part. + + Theoretically, we could also get a + similar situation when a DELETE operation + is blocked by a FOREIGN KEY constraint. */ + goto func_exit; + } + + if (dtuple_contains_null(entry)) { + /* The UNIQUE KEY columns match, but + there is a NULL value in the key, and + NULL!=NULL. */ + goto insert_the_rec; + } + + goto duplicate; + } + } else { + switch (op) { + rec_t* rec; + big_rec_t* big_rec; + case ROW_OP_DELETE: + /* The record does not exist. For example, we + could detect a duplicate key error in some old + index before logging an ROW_OP_INSERT for our + index. This ROW_OP_DELETE could be logged for + rolling back TRX_UNDO_INSERT_REC. */ + goto func_exit; + case ROW_OP_INSERT: + if (dict_index_is_unique(index) + && (cursor.up_match + >= dict_index_get_n_unique(index) + || cursor.low_match + >= dict_index_get_n_unique(index)) + && (!index->n_nullable + || !dtuple_contains_null(entry))) { +duplicate: + /* Duplicate key */ + ut_ad(dict_index_is_unique(index)); + row_merge_dup_report(dup, entry->fields); + *error = DB_DUPLICATE_KEY; + goto func_exit; + } +insert_the_rec: + /* Insert the record. As we are inserting into + a secondary index, there cannot be externally + stored columns (!big_rec). 
*/ + *error = btr_cur_optimistic_insert( + BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG, + &cursor, &offsets, &offsets_heap, + const_cast<dtuple_t*>(entry), + &rec, &big_rec, 0, NULL, &mtr); + ut_ad(!big_rec); + if (*error != DB_FAIL) { + break; + } + + if (!has_index_lock) { + /* This needs a pessimistic operation. + Lock the index tree exclusively. */ + mtr_commit(&mtr); + mtr_start(&mtr); + index->set_modified(mtr); + *error = cursor.search_leaf(entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, + &mtr); + if (*error != DB_SUCCESS) { + break; + } + } + + /* We already determined that the + record did not exist. No other thread + than the current one is allowed to + modify the index tree. Thus, the + record should still not exist. */ + + *error = btr_cur_pessimistic_insert( + BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG, + &cursor, &offsets, &offsets_heap, + const_cast<dtuple_t*>(entry), + &rec, &big_rec, + 0, NULL, &mtr); + ut_ad(!big_rec); + break; + } + mem_heap_empty(offsets_heap); + } + + if (*error == DB_SUCCESS && trx_id) { + page_update_max_trx_id(btr_cur_get_block(&cursor), + btr_cur_get_page_zip(&cursor), + trx_id, &mtr); + } + +func_exit: + mtr_commit(&mtr); +} + +/******************************************************//** +Applies an operation to a secondary index that was being created. +@return NULL on failure (mrec corruption) or when out of data; +pointer to next record on success */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +const mrec_t* +row_log_apply_op( +/*=============*/ + dict_index_t* index, /*!< in/out: index */ + row_merge_dup_t*dup, /*!< in/out: for reporting + duplicate key errors */ + dberr_t* error, /*!< out: DB_SUCCESS or error code */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap for + allocating offsets; can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap for + allocating data tuples */ + bool has_index_lock, /*!< in: true if holding index->lock + in exclusive mode */ + const mrec_t* mrec, /*!< in: merge record */ + const mrec_t* mrec_end, /*!< in: end of buffer */ + rec_offs* offsets) /*!< in/out: work area for + rec_init_offsets_temp() */ + +{ + enum row_op op; + ulint extra_size; + ulint data_size; + dtuple_t* entry; + trx_id_t trx_id; + + /* Online index creation is only used for secondary indexes. */ + ut_ad(!dict_index_is_clust(index)); + + ut_ad(index->lock.have_x() == has_index_lock); + + if (index->is_corrupted()) { + *error = DB_INDEX_CORRUPT; + return(NULL); + } + + *error = DB_SUCCESS; + + if (mrec + ROW_LOG_HEADER_SIZE >= mrec_end) { + return(NULL); + } + + switch (*mrec) { + case ROW_OP_INSERT: + if (ROW_LOG_HEADER_SIZE + DATA_TRX_ID_LEN + mrec >= mrec_end) { + return(NULL); + } + + op = static_cast<enum row_op>(*mrec++); + trx_id = trx_read_trx_id(mrec); + mrec += DATA_TRX_ID_LEN; + break; + case ROW_OP_DELETE: + op = static_cast<enum row_op>(*mrec++); + trx_id = 0; + break; + default: +corrupted: + ut_ad(0); + *error = DB_CORRUPTION; + return(NULL); + } + + extra_size = *mrec++; + + ut_ad(mrec < mrec_end); + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. */ + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *mrec++; + } + + mrec += extra_size; + + if (mrec > mrec_end) { + return(NULL); + } + + rec_init_offsets_temp(mrec, index, offsets); + + if (rec_offs_any_extern(offsets)) { + /* There should never be any externally stored fields + in a secondary index, which is what online index + creation is used for. Therefore, the log file must be + corrupted. 
*/ + goto corrupted; + } + + data_size = rec_offs_data_size(offsets); + + mrec += data_size; + + if (mrec > mrec_end) { + return(NULL); + } + + entry = row_rec_to_index_entry_low( + mrec - data_size, index, offsets, heap); + /* Online index creation is only implemented for secondary + indexes, which never contain off-page columns. */ + ut_ad(dtuple_get_n_ext(entry) == 0); + + row_log_apply_op_low(index, dup, error, offsets_heap, + has_index_lock, op, trx_id, entry); + return(mrec); +} + +/** Applies operations to a secondary index that was being created. +@param[in] trx transaction (for checking if the operation was +interrupted) +@param[in,out] index index +@param[in,out] dup for reporting duplicate key errors +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. If not NULL, then stage->inc() will be called for each block +of log that is applied or nullptr when row log applied done by DML +thread. +@return DB_SUCCESS, or error code on failure */ +static +dberr_t +row_log_apply_ops( + const trx_t* trx, + dict_index_t* index, + row_merge_dup_t* dup, + ut_stage_alter_t* stage) +{ + dberr_t error; + const mrec_t* mrec = NULL; + const mrec_t* next_mrec; + const mrec_t* mrec_end= NULL; /* silence bogus warning */ + const mrec_t* next_mrec_end; + mem_heap_t* offsets_heap; + mem_heap_t* heap; + rec_offs* offsets; + bool has_index_lock; + const ulint i = 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + + ut_ad(dict_index_is_online_ddl(index) + || (index->online_log + && index->online_status == ONLINE_INDEX_COMPLETE)); + ut_ad(!index->is_committed()); + ut_ad(index->lock.have_x()); + ut_ad(index->online_log); + + MEM_UNDEFINED(&mrec_end, sizeof mrec_end); + + offsets = static_cast<rec_offs*>(ut_malloc_nokey(i * sizeof *offsets)); + rec_offs_set_n_alloc(offsets, i); + rec_offs_set_n_fields(offsets, dict_index_get_n_fields(index)); + + offsets_heap = mem_heap_create(srv_page_size); + heap = mem_heap_create(srv_page_size); + has_index_lock = true; + +next_block: + ut_ad(has_index_lock); + ut_ad(index->lock.have_x()); + ut_ad(index->online_log->head.bytes == 0); + + if (stage) { + stage->inc(row_log_progress_inc_per_block()); + } + + if (trx_is_interrupted(trx)) { + goto interrupted; + } + + error = index->online_log->error; + if (error != DB_SUCCESS) { + goto func_exit; + } + + if (index->is_corrupted()) { + error = DB_INDEX_CORRUPT; + goto func_exit; + } + + if (UNIV_UNLIKELY(index->online_log->head.blocks + > index->online_log->tail.blocks)) { +unexpected_eof: + ib::error() << "Unexpected end of temporary file for index " + << index->name; +corruption: + error = DB_CORRUPTION; + goto func_exit; + } + + if (index->online_log->head.blocks + == index->online_log->tail.blocks) { + if (index->online_log->head.blocks) { +#ifdef HAVE_FTRUNCATE + /* Truncate the file in order to save space. */ + if (index->online_log->fd > 0 + && ftruncate(index->online_log->fd, 0) == -1) { + ib::error() + << "\'" << index->name + 1 + << "\' failed with error " + << errno << ":" << strerror(errno); + + goto corruption; + } +#endif /* HAVE_FTRUNCATE */ + index->online_log->head.blocks + = index->online_log->tail.blocks = 0; + } + + next_mrec = index->online_log->tail.block; + next_mrec_end = next_mrec + index->online_log->tail.bytes; + + if (next_mrec_end == next_mrec) { + /* End of log reached. 
*/ +all_done: + ut_ad(has_index_lock); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + index->online_log->tail.bytes = 0; + index->online_log->head.bytes = 0; + error = DB_SUCCESS; + goto func_exit; + } + } else { + os_offset_t ofs = static_cast<os_offset_t>( + index->online_log->head.blocks) + * srv_sort_buf_size; + ut_ad(has_index_lock); + has_index_lock = false; + index->lock.x_unlock(); + + log_free_check(); + + if (!row_log_block_allocate(index->online_log->head)) { + error = DB_OUT_OF_MEMORY; + goto func_exit; + } + + byte* buf = index->online_log->head.block; + + if (DB_SUCCESS + != os_file_read(IORequestRead, index->online_log->fd, + buf, ofs, srv_sort_buf_size, nullptr)) { + ib::error() + << "Unable to read temporary file" + " for index " << index->name; + goto corruption; + } + + if (srv_encrypt_log) { + if (!log_tmp_block_decrypt( + buf, srv_sort_buf_size, + index->online_log->crypt_head, ofs)) { + error = DB_DECRYPTION_FAILED; + goto func_exit; + } + + srv_stats.n_rowlog_blocks_decrypted.inc(); + memcpy(buf, index->online_log->crypt_head, srv_sort_buf_size); + } + +#ifdef POSIX_FADV_DONTNEED + /* Each block is read exactly once. Free up the file cache. */ + posix_fadvise(index->online_log->fd, + ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED); +#endif /* POSIX_FADV_DONTNEED */ + + next_mrec = index->online_log->head.block; + next_mrec_end = next_mrec + srv_sort_buf_size; + } + + if (mrec) { + /* A partial record was read from the previous block. + Copy the temporary buffer full, as we do not know the + length of the record. Parse subsequent records from + the bigger buffer index->online_log->head.block + or index->online_log->tail.block. */ + + ut_ad(mrec == index->online_log->head.buf); + ut_ad(mrec_end > mrec); + ut_ad(mrec_end < (&index->online_log->head.buf)[1]); + + memcpy((mrec_t*) mrec_end, next_mrec, + ulint((&index->online_log->head.buf)[1] - mrec_end)); + mrec = row_log_apply_op( + index, dup, &error, offsets_heap, heap, + has_index_lock, index->online_log->head.buf, + (&index->online_log->head.buf)[1], offsets); + if (error != DB_SUCCESS) { + goto func_exit; + } else if (UNIV_UNLIKELY(mrec == NULL)) { + /* The record was not reassembled properly. */ + goto corruption; + } + /* The record was previously found out to be + truncated. Now that the parse buffer was extended, + it should proceed beyond the old end of the buffer. */ + ut_a(mrec > mrec_end); + + index->online_log->head.bytes = ulint(mrec - mrec_end); + next_mrec += index->online_log->head.bytes; + } + + ut_ad(next_mrec <= next_mrec_end); + /* The following loop must not be parsing the temporary + buffer, but head.block or tail.block. */ + + /* mrec!=NULL means that the next record starts from the + middle of the block */ + ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0)); + +#ifdef UNIV_DEBUG + if (next_mrec_end == index->online_log->head.block + + srv_sort_buf_size) { + /* If tail.bytes == 0, next_mrec_end can also be at + the end of tail.block. 
*/ + if (index->online_log->tail.bytes == 0) { + ut_ad(next_mrec == next_mrec_end); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes == 0); + } else { + ut_ad(next_mrec == index->online_log->head.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks + > index->online_log->head.blocks); + } + } else if (next_mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes) { + ut_ad(next_mrec == index->online_log->tail.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes + <= index->online_log->tail.bytes); + } else { + ut_error; + } +#endif /* UNIV_DEBUG */ + + mrec_end = next_mrec_end; + + while (!trx_is_interrupted(trx)) { + mrec = next_mrec; + ut_ad(mrec < mrec_end); + + if (!has_index_lock) { + /* We are applying operations from a different + block than the one that is being written to. + We do not hold index->lock in order to + allow other threads to concurrently buffer + modifications. */ + ut_ad(mrec >= index->online_log->head.block); + ut_ad(mrec_end == index->online_log->head.block + + srv_sort_buf_size); + ut_ad(index->online_log->head.bytes + < srv_sort_buf_size); + + /* Take the opportunity to do a redo log + checkpoint if needed. */ + log_free_check(); + } else { + /* We are applying operations from the last block. + Do not allow other threads to buffer anything, + so that we can finally catch up and synchronize. */ + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(mrec >= index->online_log->tail.block); + } + + next_mrec = row_log_apply_op( + index, dup, &error, offsets_heap, heap, + has_index_lock, mrec, mrec_end, offsets); + + if (error != DB_SUCCESS) { + goto func_exit; + } else if (next_mrec == next_mrec_end) { + /* The record happened to end on a block boundary. + Do we have more blocks left? */ + if (has_index_lock) { + /* The index will be locked while + applying the last block. */ + goto all_done; + } + + mrec = NULL; +process_next_block: + index->lock.x_lock(SRW_LOCK_CALL); + has_index_lock = true; + + index->online_log->head.bytes = 0; + index->online_log->head.blocks++; + goto next_block; + } else if (next_mrec != NULL) { + ut_ad(next_mrec < next_mrec_end); + index->online_log->head.bytes + += ulint(next_mrec - mrec); + } else if (has_index_lock) { + /* When mrec is within tail.block, it should + be a complete record, because we are holding + index->lock and thus excluding the writer. */ + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(0); + goto unexpected_eof; + } else { + memcpy(index->online_log->head.buf, mrec, + ulint(mrec_end - mrec)); + mrec_end += ulint(index->online_log->head.buf - mrec); + mrec = index->online_log->head.buf; + goto process_next_block; + } + } + +interrupted: + error = DB_INTERRUPTED; +func_exit: + if (!has_index_lock) { + index->lock.x_lock(SRW_LOCK_CALL); + } + + switch (error) { + case DB_SUCCESS: + break; + case DB_INDEX_CORRUPT: + if (((os_offset_t) index->online_log->tail.blocks + 1) + * srv_sort_buf_size >= srv_online_max_size) { + /* The log file grew too big. 
*/ + error = DB_ONLINE_LOG_TOO_BIG; + } + /* fall through */ + default: + index->type |= DICT_CORRUPT; + } + + mem_heap_free(heap); + mem_heap_free(offsets_heap); + row_log_block_free(index->online_log->head); + ut_free(offsets); + return(error); +} + +/** Apply the row log to the index upon completing index creation. +@param[in] trx transaction (for checking if the operation was +interrupted) +@param[in,out] index secondary index +@param[in,out] table MySQL table (for reporting duplicates) +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. stage->begin_phase_log_index() will be called initially and then +stage->inc() will be called for each block of log that is applied or nullptr +when row log has been applied by DML thread. +@return DB_SUCCESS, or error code on failure */ +dberr_t +row_log_apply( + const trx_t* trx, + dict_index_t* index, + struct TABLE* table, + ut_stage_alter_t* stage) +{ + dberr_t error; + row_merge_dup_t dup = { index, table, NULL, 0 }; + DBUG_ENTER("row_log_apply"); + + ut_ad(dict_index_is_online_ddl(index) + || (index->online_log + && index->online_status == ONLINE_INDEX_COMPLETE)); + ut_ad(!dict_index_is_clust(index)); + + if (stage) { + stage->begin_phase_log_index(); + } + + log_free_check(); + + index->lock.x_lock(SRW_LOCK_CALL); + + if (index->online_log && !index->table->corrupted) { + error = row_log_apply_ops(trx, index, &dup, stage); + } else { + error = DB_SUCCESS; + } + + if (error != DB_SUCCESS) { + ut_ad(index->table->space); + index->type |= DICT_CORRUPT; + index->table->drop_aborted = TRUE; + + dict_index_set_online_status(index, ONLINE_INDEX_ABORTED); + } else if (stage) { + /* Mark the index as completed only when it is + being called by DDL thread */ + ut_ad(dup.n_dup == 0); + dict_index_set_online_status(index, ONLINE_INDEX_COMPLETE); + } + + index->lock.x_unlock(); + + DBUG_RETURN(error); +} + +unsigned row_log_get_n_core_fields(const dict_index_t *index) +{ + ut_ad(index->online_log); + return index->online_log->n_core_fields; +} + +dberr_t row_log_get_error(const dict_index_t *index) +{ + ut_ad(index->online_log); + return index->online_log->error; +} + +dberr_t dict_table_t::clear(que_thr_t *thr) +{ + dberr_t err= DB_SUCCESS; + for (dict_index_t *index= UT_LIST_GET_FIRST(indexes); index; + index= UT_LIST_GET_NEXT(indexes, index)) + { + if (index->type & DICT_FTS) + continue; + + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + continue; + case ONLINE_INDEX_COMPLETE: + break; + case ONLINE_INDEX_CREATION: + ut_ad("invalid type" == 0); + MY_ASSERT_UNREACHABLE(); + break; + } + if (dberr_t err_index= index->clear(thr)) + err= err_index; + } + return err; +} + +inline bool UndorecApplier::is_same(roll_ptr_t roll_ptr) const +{ + return uint16_t(roll_ptr) == offset && + uint32_t(roll_ptr >> 16) == page_id.page_no(); +} + +const rec_t * +UndorecApplier::get_old_rec(const dtuple_t &tuple, dict_index_t *index, + const rec_t **clust_rec, rec_offs **offsets) +{ + ut_ad(index->is_primary()); + btr_pcur_t pcur; + + bool found= row_search_on_row_ref(&pcur, BTR_MODIFY_LEAF, + index->table, &tuple, &mtr); + ut_a(found); + *clust_rec= btr_pcur_get_rec(&pcur); + + ulint len= 0; + rec_t *prev_version; + const rec_t *version= *clust_rec; + do + { + *offsets= rec_get_offsets(version, index, *offsets, + index->n_core_fields, ULINT_UNDEFINED, + &heap); + roll_ptr_t roll_ptr= trx_read_roll_ptr( + rec_get_nth_field(version, *offsets, index->db_roll_ptr(), &len)); + 
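/* Stop at the record version whose roll pointer refers to this undo log record; otherwise walk back to the previous version and check again. */ +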
ut_ad(len == DATA_ROLL_PTR_LEN); + if (is_same(roll_ptr)) + return version; + trx_undo_prev_version_build(version, index, *offsets, heap, &prev_version, + nullptr, nullptr, 0); + version= prev_version; + } + while (version); + + return nullptr; +} + +/** Clear out all online log of other online indexes after +encountering the error during row_log_apply() in DML thread +@param table table which does online DDL */ +static void row_log_mark_other_online_index_abort(dict_table_t *table) +{ + dict_index_t *clust_index= dict_table_get_first_index(table); + for (dict_index_t *index= dict_table_get_next_index(clust_index); + index; index= dict_table_get_next_index(index)) + { + if (index->online_log && + index->online_status <= ONLINE_INDEX_CREATION && + !index->is_corrupted()) + { + index->lock.x_lock(SRW_LOCK_CALL); + row_log_abort_sec(index); + index->type|= DICT_CORRUPT; + index->lock.x_unlock(); + MONITOR_ATOMIC_INC(MONITOR_BACKGROUND_DROP_INDEX); + } + } + + clust_index->lock.x_lock(SRW_LOCK_CALL); + clust_index->online_log= nullptr; + clust_index->lock.x_unlock(); + table->drop_aborted= TRUE; +} + +void dtype_t::assign(const dict_col_t &col) +{ + prtype= col.prtype; + mtype= col.mtype; + len= col.len; + mbminlen= col.mbminlen; + mbmaxlen= col.mbmaxlen; +} + +inline void dtuple_t::copy_field_types(const dict_index_t &index) +{ + ut_ad(index.n_fields == n_fields); + if (UNIV_LIKELY_NULL(index.change_col_info)) + for (ulint i= 0; i < n_fields; i++) + fields[i].type.assign(*index.fields[i].col); +} + +void UndorecApplier::log_insert(const dtuple_t &tuple, + dict_index_t *clust_index) +{ + DEBUG_SYNC_C("row_log_insert_handle"); + ut_ad(clust_index->is_primary()); + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs *offsets= offsets_; + + rec_offs_init(offsets_); + mtr.start(); + const rec_t *rec; + const rec_t *match_rec= get_old_rec(tuple, clust_index, &rec, &offsets); + if (!match_rec) + { + mtr.commit(); + return; + } + const rec_t *copy_rec= match_rec; + if (match_rec == rec) + { + copy_rec= rec_copy(mem_heap_alloc( + heap, rec_offs_size(offsets)), match_rec, offsets); + rec_offs_make_valid(copy_rec, clust_index, true, offsets); + } + mtr.commit(); + + dict_table_t *table= clust_index->table; + clust_index->lock.s_lock(SRW_LOCK_CALL); + if (clust_index->online_log && + !clust_index->online_log_is_dummy() && + clust_index->online_status <= ONLINE_INDEX_CREATION) + { + row_log_table_insert(copy_rec, clust_index, offsets); + clust_index->lock.s_unlock(); + } + else + { + clust_index->lock.s_unlock(); + row_ext_t *ext; + dtuple_t *row= row_build(ROW_COPY_POINTERS, clust_index, + copy_rec, offsets, table, nullptr, nullptr, &ext, heap); + + if (table->n_v_cols) + { + /* Update the row with virtual column values present + in the undo log or update vector */ + if (type == TRX_UNDO_UPD_DEL_REC) + row_upd_replace_vcol(row, table, update, false, nullptr, + (cmpl_info & UPD_NODE_NO_ORD_CHANGE) + ? 
nullptr : undo_rec); + else + trx_undo_read_v_cols(table, undo_rec, row, false); + } + + bool success= true; + for (dict_index_t *index= clust_index; + (index= dict_table_get_next_index(index)) != nullptr; ) + { + index->lock.s_lock(SRW_LOCK_CALL); + if (index->online_log && + index->online_status <= ONLINE_INDEX_CREATION && + !index->is_corrupted()) + { + dtuple_t *entry= row_build_index_entry_low(row, ext, index, + heap, ROW_BUILD_NORMAL); + entry->copy_field_types(*index); + success= row_log_online_op(index, entry, trx_id); + } + + index->lock.s_unlock(); + if (!success) + { + row_log_mark_other_online_index_abort(index->table); + return; + } + } + } +} + +void UndorecApplier::log_update(const dtuple_t &tuple, + dict_index_t *clust_index) +{ + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs offsets2_[REC_OFFS_NORMAL_SIZE]; + rec_offs *offsets= offsets_; + rec_offs *prev_offsets= offsets2_; + + rec_offs_init(offsets_); + rec_offs_init(offsets2_); + + dict_table_t *table= clust_index->table; + + clust_index->lock.s_lock(SRW_LOCK_CALL); + bool table_rebuild= + (clust_index->online_log + && !clust_index->online_log_is_dummy() + && clust_index->online_status <= ONLINE_INDEX_CREATION); + clust_index->lock.s_unlock(); + + mtr.start(); + const rec_t *rec; + rec_t *prev_version; + bool is_update= (type == TRX_UNDO_UPD_EXIST_REC); + const rec_t *match_rec= get_old_rec(tuple, clust_index, &rec, &offsets); + if (!match_rec) + { + mtr.commit(); + return; + } + + if (table_rebuild) + { + const rec_t *copy_rec= match_rec; + if (match_rec == rec) + copy_rec= rec_copy(mem_heap_alloc( + heap, rec_offs_size(offsets)), match_rec, offsets); + trx_undo_prev_version_build(match_rec, clust_index, offsets, heap, + &prev_version, nullptr, nullptr, 0); + + prev_offsets= rec_get_offsets(prev_version, clust_index, prev_offsets, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + rec_offs_make_valid(copy_rec, clust_index, true, offsets); + mtr.commit(); + + clust_index->lock.s_lock(SRW_LOCK_CALL); + /* Recheck whether clustered index online log has been cleared */ + if (clust_index->online_log) + { + if (is_update) + { + const dtuple_t *rebuilt_old_pk= row_log_table_get_pk( + prev_version, clust_index, prev_offsets, nullptr, &heap); + row_log_table_update(copy_rec, clust_index, offsets, rebuilt_old_pk); + } + else + row_log_table_delete(prev_version, clust_index, prev_offsets, nullptr); + } + clust_index->lock.s_unlock(); + return; + } + + dtuple_t *row= nullptr; + row_ext_t *new_ext; + if (match_rec != rec) + row= row_build(ROW_COPY_POINTERS, clust_index, match_rec, offsets, + clust_index->table, NULL, NULL, &new_ext, heap); + else + row= row_build(ROW_COPY_DATA, clust_index, rec, offsets, + clust_index->table, NULL, NULL, &new_ext, heap); + mtr.commit(); + row_ext_t *old_ext; + dtuple_t *old_row= nullptr; + if (!(this->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) + { + for (ulint i = 0; i < dict_table_get_n_v_cols(table); i++) + dfield_get_type( + dtuple_get_nth_v_field(row, i))->mtype = DATA_MISSING; + } + + if (is_update) + { + old_row= dtuple_copy(row, heap); + row_upd_replace(old_row, &old_ext, clust_index, update, heap); + } + + if (table->n_v_cols) + row_upd_replace_vcol(row, table, update, false, nullptr, + (cmpl_info & UPD_NODE_NO_ORD_CHANGE) + ? 
nullptr : undo_rec); + + bool success= true; + dict_index_t *index= dict_table_get_next_index(clust_index); + while (index) + { + index->lock.s_lock(SRW_LOCK_CALL); + if (index->online_log && + index->online_status <= ONLINE_INDEX_CREATION && + !index->is_corrupted()) + { + if (is_update) + { + /* Ignore the index if the update doesn't affect the index */ + if (!row_upd_changes_ord_field_binary(index, update, + nullptr, + row, new_ext)) + goto next_index; + dtuple_t *old_entry= row_build_index_entry_low( + old_row, old_ext, index, heap, ROW_BUILD_NORMAL); + + old_entry->copy_field_types(*index); + + success= row_log_online_op(index, old_entry, 0); + + dtuple_t *new_entry= row_build_index_entry_low( + row, new_ext, index, heap, ROW_BUILD_NORMAL); + + new_entry->copy_field_types(*index); + + if (success) + success= row_log_online_op(index, new_entry, trx_id); + } + else + { + dtuple_t *old_entry= row_build_index_entry_low( + row, new_ext, index, heap, ROW_BUILD_NORMAL); + + old_entry->copy_field_types(*index); + + success= row_log_online_op(index, old_entry, 0); + } + } +next_index: + index->lock.s_unlock(); + if (!success) + { + row_log_mark_other_online_index_abort(index->table); + return; + } + index= dict_table_get_next_index(index); + } +} + diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc new file mode 100644 index 00000000..5df93fe6 --- /dev/null +++ b/storage/innobase/row/row0merge.cc @@ -0,0 +1,5406 @@ +/***************************************************************************** + +Copyright (c) 2005, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2014, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0merge.cc +New index creation routines using a merge sort + +Created 12/4/2005 Jan Lindstrom +Completed by Sunny Bains and Marko Makela +*******************************************************/ +#include <my_global.h> +#include <log.h> +#include <sql_class.h> +#include <math.h> + +#include "row0merge.h" +#include "row0ext.h" +#include "row0log.h" +#include "row0ins.h" +#include "row0row.h" +#include "row0sel.h" +#include "log0crypt.h" +#include "dict0crea.h" +#include "trx0purge.h" +#include "lock0lock.h" +#include "pars0pars.h" +#include "ut0sort.h" +#include "row0ftsort.h" +#include "row0import.h" +#include "row0vers.h" +#include "handler0alter.h" +#include "btr0bulk.h" +#ifdef BTR_CUR_ADAPT +# include "btr0sea.h" +#endif /* BTR_CUR_ADAPT */ +#include "ut0stage.h" +#include "fil0crypt.h" +#include "srv0mon.h" + +/* Ignore posix_fadvise() on those platforms where it does not exist */ +#if defined _WIN32 +# define posix_fadvise(fd, offset, len, advice) /* nothing */ +#endif /* _WIN32 */ + +/* Whether to disable file system cache */ +char srv_disable_sort_file_cache; + +/** Class that caches spatial index row tuples made from a single cluster +index page scan, and then insert into corresponding index tree */ +class spatial_index_info { +public: + /** constructor + @param index spatial index to be created */ + spatial_index_info(dict_index_t *index) : index(index) + { + ut_ad(index->is_spatial()); + } + + /** Caches an index row into index tuple vector + @param[in] row table row + @param[in] ext externally stored column prefixes, or NULL */ + void add(const dtuple_t *row, const row_ext_t *ext, mem_heap_t *heap) + { + dtuple_t *dtuple= row_build_index_entry(row, ext, index, heap); + ut_ad(dtuple); + ut_ad(dtuple->n_fields == index->n_fields); + if (ext) + { + /* Replace any references to ext, because ext will be allocated + from row_heap. 
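dfield_dup() copies such column prefixes into the heap passed to this function, so the cached tuples stay valid after row_heap is reused.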
*/ + for (ulint i= 1; i < dtuple->n_fields; i++) + { + dfield_t &dfield= dtuple->fields[i]; + if (dfield.data >= ext->buf && + dfield.data <= &ext->buf[ext->n_ext * ext->max_len]) + dfield_dup(&dfield, heap); + } + } + m_dtuple_vec.push_back(dtuple); + } + + /** Insert spatial index rows cached in vector into spatial index + @param[in] trx_id transaction id + @param[in] pcur cluster index scanning cursor + @param[in,out] mtr_started whether scan_mtr is active + @param[in,out] heap temporary memory heap + @param[in,out] scan_mtr mini-transaction for pcur + @return DB_SUCCESS if successful, else error number */ + dberr_t insert(trx_id_t trx_id, btr_pcur_t* pcur, + bool& mtr_started, mem_heap_t* heap, mtr_t* scan_mtr) + { + big_rec_t* big_rec; + rec_t* rec; + btr_cur_t ins_cur; + mtr_t mtr; + rtr_info_t rtr_info; + rec_offs* ins_offsets = NULL; + dberr_t error = DB_SUCCESS; + dtuple_t* dtuple; + const ulint flag = BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG | BTR_CREATE_FLAG; + + ut_ad(mtr_started == scan_mtr->is_active()); + + DBUG_EXECUTE_IF("row_merge_instrument_log_check_flush", + log_sys.set_check_flush_or_checkpoint();); + + for (idx_tuple_vec::iterator it = m_dtuple_vec.begin(); + it != m_dtuple_vec.end(); + ++it) { + dtuple = *it; + ut_ad(dtuple); + + if (log_sys.check_flush_or_checkpoint()) { + if (mtr_started) { + if (!btr_pcur_move_to_prev_on_page(pcur)) { + error = DB_CORRUPTION; + break; + } + btr_pcur_store_position(pcur, scan_mtr); + scan_mtr->commit(); + mtr_started = false; + } + + log_free_check(); + } + + mtr.start(); + index->set_modified(mtr); + + ins_cur.page_cur.index = index; + rtr_init_rtr_info(&rtr_info, false, &ins_cur, index, + false); + rtr_info_update_btr(&ins_cur, &rtr_info); + + error = rtr_insert_leaf(&ins_cur, dtuple, + BTR_MODIFY_LEAF, &mtr); + + /* It need to update MBR in parent entry, + so change search mode to BTR_MODIFY_TREE */ + if (error == DB_SUCCESS && rtr_info.mbr_adj) { + mtr.commit(); + rtr_clean_rtr_info(&rtr_info, true); + rtr_init_rtr_info(&rtr_info, false, &ins_cur, + index, false); + rtr_info_update_btr(&ins_cur, &rtr_info); + mtr.start(); + index->set_modified(mtr); + error = rtr_insert_leaf(&ins_cur, dtuple, + BTR_MODIFY_TREE, &mtr); + } + + if (error == DB_SUCCESS) { + error = btr_cur_optimistic_insert( + flag, &ins_cur, &ins_offsets, + &heap, dtuple, &rec, &big_rec, + 0, NULL, &mtr); + } + + ut_ad(!big_rec); + + if (error == DB_FAIL) { + mtr.commit(); + mtr.start(); + index->set_modified(mtr); + + rtr_clean_rtr_info(&rtr_info, true); + rtr_init_rtr_info(&rtr_info, false, + &ins_cur, index, false); + + rtr_info_update_btr(&ins_cur, &rtr_info); + error = rtr_insert_leaf(&ins_cur, dtuple, + BTR_MODIFY_TREE, &mtr); + + if (error == DB_SUCCESS) { + error = btr_cur_pessimistic_insert( + flag, &ins_cur, &ins_offsets, + &heap, dtuple, &rec, + &big_rec, 0, NULL, &mtr); + } + } + + ut_ad(!big_rec); + + DBUG_EXECUTE_IF( + "row_merge_ins_spatial_fail", + error = DB_FAIL; + ); + + if (error == DB_SUCCESS) { + if (rtr_info.mbr_adj) { + error = rtr_ins_enlarge_mbr( + &ins_cur, &mtr); + } + + if (error == DB_SUCCESS) { + page_update_max_trx_id( + btr_cur_get_block(&ins_cur), + btr_cur_get_page_zip(&ins_cur), + trx_id, &mtr); + } + } + + mtr.commit(); + + rtr_clean_rtr_info(&rtr_info, true); + } + + m_dtuple_vec.clear(); + + return(error); + } + +private: + /** Cache index rows made from a cluster index scan. 
Usually + for rows on single cluster index page */ + typedef std::vector<dtuple_t*, ut_allocator<dtuple_t*> > idx_tuple_vec; + + /** vector used to cache index rows made from cluster index scan */ + idx_tuple_vec m_dtuple_vec; +public: + /** the index being built */ + dict_index_t*const index; +}; + +/* Maximum pending doc memory limit in bytes for a fts tokenization thread */ +#define FTS_PENDING_DOC_MEMORY_LIMIT 1000000 + +/** Insert sorted data tuples to the index. +@param[in] index index to be inserted +@param[in] old_table old table +@param[in] fd file descriptor +@param[in,out] block file buffer +@param[in] row_buf row_buf the sorted data tuples, +or NULL if fd, block will be used instead +@param[in,out] btr_bulk btr bulk instance +@param[in] table_total_rows total rows of old table +@param[in] pct_progress total progress percent untill now +@param[in] pct_cost current progress percent +@param[in] crypt_block buffer for encryption or NULL +@param[in] space space id +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. If not NULL stage->begin_phase_insert() will be called initially +and then stage->inc() will be called for each record that is processed. +@param[in] blob_file To read big column field data from + the given blob file. It is + applicable only for bulk insert + operation +@return DB_SUCCESS or error number */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +row_merge_insert_index_tuples( + dict_index_t* index, + const dict_table_t* old_table, + const pfs_os_file_t& fd, + row_merge_block_t* block, + const row_merge_buf_t* row_buf, + BtrBulk* btr_bulk, + const ib_uint64_t table_total_rows, + double pct_progress, + double pct_cost, + row_merge_block_t* crypt_block, + ulint space, + ut_stage_alter_t* stage= nullptr, + merge_file_t* blob_file= nullptr); + +/** Encode an index record. +@return size of the record */ +static MY_ATTRIBUTE((nonnull)) +ulint +row_merge_buf_encode( +/*=================*/ + byte** b, /*!< in/out: pointer to + current end of output buffer */ + const dict_index_t* index, /*!< in: index */ + const mtuple_t* entry, /*!< in: index fields + of the record to encode */ + ulint n_fields) /*!< in: number of fields + in the entry */ +{ + ulint size; + ulint extra_size; + + size = rec_get_converted_size_temp<false>( + index, entry->fields, n_fields, &extra_size); + ut_ad(size >= extra_size); + + /* Encode extra_size + 1 */ + if (extra_size + 1 < 0x80) { + *(*b)++ = (byte) (extra_size + 1); + } else { + ut_ad((extra_size + 1) < 0x8000); + *(*b)++ = (byte) (0x80 | ((extra_size + 1) >> 8)); + *(*b)++ = (byte) (extra_size + 1); + } + + rec_convert_dtuple_to_temp<false>(*b + extra_size, index, + entry->fields, n_fields); + + *b += size; + return size; +} + +static MY_ATTRIBUTE((malloc, nonnull)) +row_merge_buf_t* +row_merge_buf_create_low( + row_merge_buf_t *buf, mem_heap_t *heap, dict_index_t *index) +{ + ulint max_tuples = srv_sort_buf_size + / std::max<ulint>(1, dict_index_get_min_size(index)); + ut_ad(max_tuples > 0); + ut_ad(max_tuples <= srv_sort_buf_size); + + buf->heap = heap; + buf->index = index; + buf->max_tuples = max_tuples; + buf->tuples = static_cast<mtuple_t*>( + ut_malloc_nokey(2 * max_tuples * sizeof *buf->tuples)); + buf->tmp_tuples = buf->tuples + max_tuples; + return(buf); +} + +/******************************************************//** +Allocate a sort buffer. 
+@return own: sort buffer */ +row_merge_buf_t* +row_merge_buf_create( +/*=================*/ + dict_index_t* index) /*!< in: secondary index */ +{ + row_merge_buf_t* buf; + ulint buf_size; + mem_heap_t* heap; + + buf_size = (sizeof *buf); + + heap = mem_heap_create(buf_size); + + buf = static_cast<row_merge_buf_t*>( + mem_heap_zalloc(heap, buf_size)); + row_merge_buf_create_low(buf, heap, index); + + return(buf); +} + +/******************************************************//** +Empty a sort buffer. +@return sort buffer */ +row_merge_buf_t* +row_merge_buf_empty( +/*================*/ + row_merge_buf_t* buf) /*!< in,own: sort buffer */ +{ + ulint buf_size = sizeof *buf; + ulint max_tuples = buf->max_tuples; + mem_heap_t* heap = buf->heap; + dict_index_t* index = buf->index; + mtuple_t* tuples = buf->tuples; + + mem_heap_empty(heap); + + buf = static_cast<row_merge_buf_t*>(mem_heap_zalloc(heap, buf_size)); + buf->heap = heap; + buf->index = index; + buf->max_tuples = max_tuples; + buf->tuples = tuples; + buf->tmp_tuples = buf->tuples + max_tuples; + + return(buf); +} + +/******************************************************//** +Deallocate a sort buffer. */ +void +row_merge_buf_free( +/*===============*/ + row_merge_buf_t* buf) /*!< in,own: sort buffer to be freed */ +{ + ut_free(buf->tuples); + mem_heap_free(buf->heap); +} + +/** Convert the field data from compact to redundant format. +@param[in] row_field field to copy from +@param[out] field field to copy to +@param[in] len length of the field data +@param[in] zip_size compressed BLOB page size, + zero for uncompressed BLOBs +@param[in,out] heap memory heap where to allocate data when + converting to ROW_FORMAT=REDUNDANT, or NULL + when not to invoke + row_merge_buf_redundant_convert(). */ +static +void +row_merge_buf_redundant_convert( + const dfield_t* row_field, + dfield_t* field, + ulint len, + ulint zip_size, + mem_heap_t* heap) +{ + ut_ad(field->type.mbminlen == 1); + ut_ad(field->type.mbmaxlen > 1); + + byte* buf = (byte*) mem_heap_alloc(heap, len); + ulint field_len = row_field->len; + ut_ad(field_len <= len); + + if (row_field->ext) { + const byte* field_data = static_cast<const byte*>( + dfield_get_data(row_field)); + ulint ext_len; + + ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE); + ut_a(memcmp(field_data + field_len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)); + + byte* data = btr_copy_externally_stored_field( + &ext_len, field_data, zip_size, field_len, heap); + + ut_ad(ext_len < len); + + memcpy(buf, data, ext_len); + field_len = ext_len; + } else { + memcpy(buf, row_field->data, field_len); + } + + memset(buf + field_len, 0x20, len - field_len); + + dfield_set_data(field, buf, len); +} + +/** Insert the tuple into bulk buffer insert operation +@param buf merge buffer for the index operation +@param table bulk insert operation for the table +@param row tuple to be inserted +@return number of rows inserted */ +static ulint row_merge_bulk_buf_add(row_merge_buf_t* buf, + const dict_table_t &table, + const dtuple_t &row) +{ + if (buf->n_tuples >= buf->max_tuples) + return 0; + + const dict_index_t *index= buf->index; + ulint n_fields= dict_index_get_n_fields(index); + mtuple_t *entry= &buf->tuples[buf->n_tuples]; + ulint data_size= 0; + ulint extra_size= UT_BITS_IN_BYTES(unsigned(index->n_nullable)); + dfield_t *field= entry->fields= static_cast<dfield_t*>( + mem_heap_alloc(buf->heap, n_fields * sizeof *entry->fields)); + const dict_field_t *ifield= dict_index_get_nth_field(index, 0); + + for 
(ulint i = 0; i < n_fields; i++, field++, ifield++) + { + dfield_copy(field, &row.fields[i]); + ulint len= dfield_get_len(field); + const dict_col_t* const col= ifield->col; + + if (dfield_is_null(field)) + continue; + + ulint fixed_len= ifield->fixed_len; + + /* CHAR in ROW_FORMAT=REDUNDANT is always + fixed-length, but in the temporary file it is + variable-length for variable-length character sets. */ + if (fixed_len && !index->table->not_redundant() && + col->mbminlen != col->mbmaxlen) + fixed_len= 0; + + if (fixed_len); + else if (len < 128 || (!DATA_BIG_COL(col))) + extra_size++; + else + extra_size += 2; + data_size += len; + } + + /* Add to the total size of the record in row_merge_block_t + the encoded length of extra_size and the extra bytes (extra_size). + See row_merge_buf_write() for the variable-length encoding + of extra_size. */ + data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80); + + /* Reserve bytes for the end marker of row_merge_block_t. */ + if (buf->total_size + data_size >= srv_sort_buf_size) + return 0; + + buf->total_size += data_size; + buf->n_tuples++; + + field= entry->fields; + + do + dfield_dup(field++, buf->heap); + while (--n_fields); + + return 1; +} + +/** Insert a data tuple into a sort buffer. +@param[in,out] buf sort buffer +@param[in] fts_index fts index to be created +@param[in] old_table original table +@param[in] new_table new table +@param[in,out] psort_info parallel sort info +@param[in,out] row table row +@param[in] ext cache of externally stored + column prefixes, or NULL +@param[in] history_fts row is historical in a system-versioned table + on which a FTS_DOC_ID_INDEX(FTS_DOC_ID) exists +@param[in,out] doc_id Doc ID if we are creating + FTS index +@param[in,out] conv_heap memory heap where to allocate data when + converting to ROW_FORMAT=REDUNDANT, or NULL + when not to invoke + row_merge_buf_redundant_convert() +@param[in,out] err set if error occurs +@param[in,out] v_heap heap memory to process data for virtual column +@param[in,out] my_table mysql table object +@param[in] trx transaction object +@param[in] col_collate columns whose collations changed, or nullptr +@return number of rows added, 0 if out of space */ +static +ulint +row_merge_buf_add( + row_merge_buf_t* buf, + dict_index_t* fts_index, + const dict_table_t* old_table, + const dict_table_t* new_table, + fts_psort_t* psort_info, + dtuple_t* row, + const row_ext_t* ext, + const bool history_fts, + doc_id_t* doc_id, + mem_heap_t* conv_heap, + dberr_t* err, + mem_heap_t** v_heap, + TABLE* my_table, + trx_t* trx, + const col_collations* col_collate) +{ + ulint i; + const dict_index_t* index; + mtuple_t* entry; + dfield_t* field; + const dict_field_t* ifield; + ulint n_fields; + ulint data_size; + ulint extra_size; + ulint bucket = 0; + doc_id_t write_doc_id; + ulint n_row_added = 0; + VCOL_STORAGE vcol_storage; + + DBUG_ENTER("row_merge_buf_add"); + + if (buf->n_tuples >= buf->max_tuples) { +error: + n_row_added = 0; + goto end; + } + + DBUG_EXECUTE_IF( + "ib_row_merge_buf_add_two", + if (buf->n_tuples >= 2) DBUG_RETURN(0);); + + UNIV_PREFETCH_R(row->fields); + + /* If we are building FTS index, buf->index points to + the 'fts_sort_idx', and real FTS index is stored in + fts_index */ + index = (buf->index->type & DICT_FTS) ? 
fts_index : buf->index; + + /* create spatial index should not come here */ + ut_ad(!dict_index_is_spatial(index)); + + n_fields = dict_index_get_n_fields(index); + + entry = &buf->tuples[buf->n_tuples]; + field = entry->fields = static_cast<dfield_t*>( + mem_heap_alloc(buf->heap, n_fields * sizeof *entry->fields)); + + data_size = 0; + extra_size = UT_BITS_IN_BYTES(unsigned(index->n_nullable)); + + ifield = dict_index_get_nth_field(index, 0); + + for (i = 0; i < n_fields; i++, field++, ifield++) { + ulint len; + ulint fixed_len; + const dfield_t* row_field; + const dict_col_t* const col = ifield->col; + const dict_v_col_t* const v_col = col->is_virtual() + ? reinterpret_cast<const dict_v_col_t*>(col) + : NULL; + + /* Process the Doc ID column */ + if (!v_col && (history_fts || *doc_id) + && col->ind == index->table->fts->doc_col) { + fts_write_doc_id((byte*) &write_doc_id, *doc_id); + + /* Note: field->data now points to a value on the + stack: &write_doc_id after dfield_set_data(). Because + there is only one doc_id per row, it shouldn't matter. + We allocate a new buffer before we leave the function + later below. */ + + dfield_set_data( + field, &write_doc_id, sizeof(write_doc_id)); + + field->type.mtype = ifield->col->mtype; + field->type.prtype = ifield->col->prtype; + field->type.mbminlen = 0; + field->type.mbmaxlen = 0; + field->type.len = ifield->col->len; + } else { + /* Use callback to get the virtual column value */ + if (v_col) { + dict_index_t* clust_index + = dict_table_get_first_index(new_table); + + if (!vcol_storage.innobase_record && + !innobase_allocate_row_for_vcol( + trx->mysql_thd, clust_index, + v_heap, &my_table, + &vcol_storage)) { + *err = DB_OUT_OF_MEMORY; + goto error; + } + + row_field = innobase_get_computed_value( + row, v_col, clust_index, + v_heap, NULL, ifield, trx->mysql_thd, + my_table, vcol_storage.innobase_record, + old_table, NULL); + + if (row_field == NULL) { + *err = DB_COMPUTE_VALUE_FAILED; + goto error; + } + dfield_copy(field, row_field); + } else { + row_field = dtuple_get_nth_field(row, + col->ind); + dfield_copy(field, row_field); + + /* Copy the column collation to the + tuple field */ + if (col_collate) { + auto it = col_collate->find(col->ind); + if (it != col_collate->end()) { + field->type + .assign(*it->second); + } + } + } + + /* Tokenize and process data for FTS */ + if (!history_fts && (index->type & DICT_FTS)) { + fts_doc_item_t* doc_item; + byte* value; + void* ptr; + const ulint max_trial_count = 10000; + ulint trial_count = 0; + + /* fetch Doc ID if it already exists + in the row, and not supplied by the + caller. Even if the value column is + NULL, we still need to get the Doc + ID so to maintain the correct max + Doc ID */ + if (*doc_id == 0) { + const dfield_t* doc_field; + doc_field = dtuple_get_nth_field( + row, + index->table->fts->doc_col); + *doc_id = (doc_id_t) mach_read_from_8( + static_cast<const byte*>( + dfield_get_data(doc_field))); + + if (*doc_id == 0) { + ib::warn() << "FTS Doc ID is" + " zero. 
Record" + " skipped"; + goto error; + } + } + + if (dfield_is_null(field)) { + n_row_added = 1; + continue; + } + + ptr = ut_malloc_nokey(sizeof(*doc_item) + + field->len); + + doc_item = static_cast<fts_doc_item_t*>(ptr); + value = static_cast<byte*>(ptr) + + sizeof(*doc_item); + memcpy(value, field->data, field->len); + field->data = value; + + doc_item->field = field; + doc_item->doc_id = *doc_id; + + bucket = static_cast<ulint>( + *doc_id % fts_sort_pll_degree); + + /* Add doc item to fts_doc_list */ + mysql_mutex_lock(&psort_info[bucket].mutex); + + if (psort_info[bucket].error == DB_SUCCESS) { + UT_LIST_ADD_LAST( + psort_info[bucket].fts_doc_list, + doc_item); + psort_info[bucket].memory_used += + sizeof(*doc_item) + field->len; + } else { + ut_free(doc_item); + } + + mysql_mutex_unlock(&psort_info[bucket].mutex); + + /* Sleep when memory used exceeds limit*/ + while (psort_info[bucket].memory_used + > FTS_PENDING_DOC_MEMORY_LIMIT + && trial_count++ < max_trial_count) { + std::this_thread::sleep_for( + std::chrono::milliseconds(1)); + } + + n_row_added = 1; + continue; + } + + /* innobase_get_computed_value() sets the + length of the virtual column field. */ + if (v_col == NULL + && field->len != UNIV_SQL_NULL + && col->mtype == DATA_MYSQL + && col->len != field->len) { + if (conv_heap != NULL) { + row_merge_buf_redundant_convert( + row_field, field, col->len, + old_table->space->zip_size(), + conv_heap); + } + } + } + + len = dfield_get_len(field); + + if (dfield_is_null(field)) { + ut_ad(!(col->prtype & DATA_NOT_NULL)); + continue; + } else if (!ext) { + } else if (dict_index_is_clust(index)) { + /* Flag externally stored fields. */ + const byte* buf = row_ext_lookup(ext, col->ind, + &len); + if (UNIV_LIKELY_NULL(buf)) { + ut_a(buf != field_ref_zero); + if (i < dict_index_get_n_unique(index)) { + dfield_set_data(field, buf, len); + } else { + dfield_set_ext(field); + len = dfield_get_len(field); + } + } + } else if (!v_col) { + /* Only non-virtual column are stored externally */ + const byte* buf = row_ext_lookup(ext, col->ind, + &len); + if (UNIV_LIKELY_NULL(buf)) { + ut_a(buf != field_ref_zero); + dfield_set_data(field, buf, len); + } + } + + /* If a column prefix index, take only the prefix */ + + if (ifield->prefix_len) { + len = dtype_get_at_most_n_mbchars( + col->prtype, + col->mbminlen, col->mbmaxlen, + ifield->prefix_len, + len, + static_cast<char*>(dfield_get_data(field))); + dfield_set_len(field, len); + } + + ut_ad(len <= col->len + || DATA_LARGE_MTYPE(col->mtype)); + + fixed_len = ifield->fixed_len; + if (fixed_len && !dict_table_is_comp(index->table) + && col->mbminlen != col->mbmaxlen) { + /* CHAR in ROW_FORMAT=REDUNDANT is always + fixed-length, but in the temporary file it is + variable-length for variable-length character + sets. */ + fixed_len = 0; + } + + if (fixed_len) { +#ifdef UNIV_DEBUG + /* len should be between size calcualted base on + mbmaxlen and mbminlen */ + ut_ad(len <= fixed_len); + ut_ad(!col->mbmaxlen || len >= col->mbminlen + * (fixed_len / col->mbmaxlen)); + + ut_ad(!dfield_is_ext(field)); +#endif /* UNIV_DEBUG */ + } else if (dfield_is_ext(field)) { + extra_size += 2; + } else if (len < 128 + || (!DATA_BIG_COL(col))) { + extra_size++; + } else { + /* For variable-length columns, we look up the + maximum length from the column itself. If this + is a prefix index column shorter than 256 bytes, + this will waste one byte. 
*/ + extra_size += 2; + } + data_size += len; + } + + /* If this is FTS index, we already populated the sort buffer, return + here */ + if (index->type & DICT_FTS) { + goto end; + } + +#ifdef UNIV_DEBUG + { + ulint size; + ulint extra; + + size = rec_get_converted_size_temp<false>( + index, entry->fields, n_fields, &extra); + + ut_ad(data_size + extra_size == size); + ut_ad(extra_size == extra); + } +#endif /* UNIV_DEBUG */ + + /* Add to the total size of the record in row_merge_block_t + the encoded length of extra_size and the extra bytes (extra_size). + See row_merge_buf_write() for the variable-length encoding + of extra_size. */ + data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80); + + /* Record size can exceed page size while converting to + redundant row format. But there is assert + ut_ad(size < srv_page_size) in rec_offs_data_size(). + It may hit the assert before attempting to insert the row. */ + if (conv_heap != NULL && data_size > srv_page_size) { + *err = DB_TOO_BIG_RECORD; + } + + ut_ad(data_size < srv_sort_buf_size); + + /* Reserve bytes for the end marker of row_merge_block_t. */ + if (buf->total_size + data_size >= srv_sort_buf_size) { + goto error; + } + + buf->total_size += data_size; + buf->n_tuples++; + n_row_added++; + + field = entry->fields; + + /* Copy the data fields. */ + + do { + dfield_dup(field++, buf->heap); + } while (--n_fields); + + if (conv_heap != NULL) { + mem_heap_empty(conv_heap); + } + +end: + if (vcol_storage.innobase_record) + innobase_free_row_for_vcol(&vcol_storage); + DBUG_RETURN(n_row_added); +} + +/*************************************************************//** +Report a duplicate key. */ +void +row_merge_dup_report( +/*=================*/ + row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */ + const dfield_t* entry) /*!< in: duplicate index entry */ +{ + if (!dup->n_dup++ && dup->table) { + /* Only report the first duplicate record, + but count all duplicate records. */ + innobase_fields_to_mysql(dup->table, dup->index, entry); + } +} + +/*************************************************************//** +Compare two tuples. +@return positive, 0, negative if a is greater, equal, less, than b, +respectively */ +static MY_ATTRIBUTE((warn_unused_result)) +int +row_merge_tuple_cmp( +/*================*/ + const dict_index_t* index, /*< in: index tree */ + ulint n_uniq, /*!< in: number of unique fields */ + ulint n_field,/*!< in: number of fields */ + const mtuple_t& a, /*!< in: first tuple to be compared */ + const mtuple_t& b, /*!< in: second tuple to be compared */ + row_merge_dup_t* dup) /*!< in/out: for reporting duplicates, + NULL if non-unique index */ +{ + int cmp; + const dfield_t* af = a.fields; + const dfield_t* bf = b.fields; + ulint n = n_uniq; + const dict_field_t* f = index->fields; + + ut_ad(n_uniq > 0); + ut_ad(n_uniq <= n_field); + + /* Compare the fields of the tuples until a difference is + found or we run out of fields to compare. If !cmp at the + end, the tuples are equal. */ + do { + cmp = cmp_dfield_dfield(af++, bf++, (f++)->descending); + } while (!cmp && --n); + + if (cmp) { + return(cmp); + } + + if (dup) { + /* Report a duplicate value error if the tuples are + logically equal. NULL columns are logically inequal, + although they are equal in the sorting order. Find + out if any of the fields are NULL. 
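+	If any of the n_uniq fields is NULL, no duplicate is reported.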
*/ + for (const dfield_t* df = a.fields; df != af; df++) { + if (dfield_is_null(df)) { + goto no_report; + } + } + + row_merge_dup_report(dup, a.fields); + } + +no_report: + /* The n_uniq fields were equal, but we compare all fields so + that we will get the same (internal) order as in the B-tree. */ + for (n = n_field - n_uniq + 1; --n; ) { + cmp = cmp_dfield_dfield(af++, bf++, (f++)->descending); + if (cmp) { + return(cmp); + } + } + + /* This should never be reached, except in a secondary index + when creating a secondary index and a PRIMARY KEY, and there + is a duplicate in the PRIMARY KEY that has not been detected + yet. Internally, an index must never contain duplicates. */ + return(cmp); +} + +/** Wrapper for row_merge_tuple_sort() to inject some more context to +UT_SORT_FUNCTION_BODY(). +@param tuples array of tuples that being sorted +@param aux work area, same size as tuples[] +@param low lower bound of the sorting area, inclusive +@param high upper bound of the sorting area, inclusive */ +#define row_merge_tuple_sort_ctx(tuples, aux, low, high) \ + row_merge_tuple_sort(index,n_uniq,n_field,dup, tuples, aux, low, high) +/** Wrapper for row_merge_tuple_cmp() to inject some more context to +UT_SORT_FUNCTION_BODY(). +@param a first tuple to be compared +@param b second tuple to be compared +@return positive, 0, negative, if a is greater, equal, less, than b, +respectively */ +#define row_merge_tuple_cmp_ctx(a,b) \ + row_merge_tuple_cmp(index, n_uniq, n_field, a, b, dup) + +/**********************************************************************//** +Merge sort the tuple buffer in main memory. */ +static +void +row_merge_tuple_sort( +/*=================*/ + const dict_index_t* index, /*!< in: index tree */ + ulint n_uniq, /*!< in: number of unique fields */ + ulint n_field,/*!< in: number of fields */ + row_merge_dup_t* dup, /*!< in/out: reporter of duplicates + (NULL if non-unique index) */ + mtuple_t* tuples, /*!< in/out: tuples */ + mtuple_t* aux, /*!< in/out: work area */ + ulint low, /*!< in: lower bound of the + sorting area, inclusive */ + ulint high) /*!< in: upper bound of the + sorting area, exclusive */ +{ + ut_ad(n_field > 0); + ut_ad(n_uniq <= n_field); + + UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx, + tuples, aux, low, high, row_merge_tuple_cmp_ctx); +} + +/******************************************************//** +Sort a buffer. 
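+All fields are compared, so the resulting order matches the internal order of the index B-tree.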
*/ +void +row_merge_buf_sort( +/*===============*/ + row_merge_buf_t* buf, /*!< in/out: sort buffer */ + row_merge_dup_t* dup) /*!< in/out: reporter of duplicates + (NULL if non-unique index) */ +{ + ut_ad(!buf->index->is_spatial()); + row_merge_tuple_sort(buf->index, buf->index->n_uniq, buf->index->n_fields, + dup, buf->tuples, buf->tmp_tuples, 0, buf->n_tuples); +} + +/** Write the blob field data to temporary file and fill the offset, +length in the field data +@param field tuple field +@param blob_file file to store the blob data +@param heap heap to store the blob offset and length +@return DB_SUCCESS if successful */ +static dberr_t row_merge_write_blob_to_tmp_file( + dfield_t *field, merge_file_t *blob_file,mem_heap_t **heap) +{ + if (blob_file->fd == OS_FILE_CLOSED) + { + blob_file->fd= row_merge_file_create_low(nullptr); + if (blob_file->fd == OS_FILE_CLOSED) + return DB_OUT_OF_MEMORY; + } + uint64_t val= blob_file->offset; + uint32_t len= field->len; + dberr_t err= os_file_write( + IORequestWrite, "(bulk insert)", blob_file->fd, + field->data, blob_file->offset, len); + + if (err != DB_SUCCESS) + return err; + + byte *data= static_cast<byte*> + (mem_heap_alloc(*heap, BTR_EXTERN_FIELD_REF_SIZE)); + + /* Write zeroes for first 8 bytes */ + memset(data, 0, 8); + /* Write offset for next 8 bytes */ + mach_write_to_8(data + 8, val); + /* Write length of the blob in 4 bytes */ + mach_write_to_4(data + 16, len); + blob_file->offset+= field->len; + blob_file->n_rec++; + dfield_set_data(field, data, BTR_EXTERN_FIELD_REF_SIZE); + dfield_set_ext(field); + return err; +} + +/** This function is invoked when tuple size is greater than +innodb_sort_buffer_size. Basically it recreates the tuple +by writing the blob field to the temporary file. +@param entry index fields to be encode the blob +@param blob_file file to store the blob data +@param heap heap to store the blob offset and blob length +@return tuple which fits into sort_buffer_size */ +static dtuple_t* row_merge_buf_large_tuple(const dtuple_t &entry, + merge_file_t *blob_file, + mem_heap_t **heap) +{ + if (!*heap) + *heap= mem_heap_create(DTUPLE_EST_ALLOC(entry.n_fields)); + + dtuple_t *tuple= dtuple_copy(&entry, *heap); + for (ulint i= 0; i < tuple->n_fields; i++) + { + dfield_t *field= &tuple->fields[i]; + if (dfield_is_null(field) || field->len <= 2000) + continue; + + dberr_t err= row_merge_write_blob_to_tmp_file(field, blob_file, heap); + if (err != DB_SUCCESS) + return nullptr; + } + + return tuple; +} + + +/** Write the field data whose length is more than 2000 bytes +into blob temporary file and write offset, length into the +tuple field +@param entry index fields to be encode the blob +@param n_fields number of fields in the entry +@param heap heap to store the blob offset and blob length +@param blob_file file to store the blob data */ +static dberr_t row_merge_buf_blob(const mtuple_t *entry, ulint n_fields, + mem_heap_t **heap, merge_file_t *blob_file) +{ + + if (!*heap) + *heap= mem_heap_create(100); + + for (ulint i= 0; i < n_fields; i++) + { + dfield_t *field= &entry->fields[i]; + if (dfield_is_null(field) || field->len <= 2000) + continue; + + dberr_t err= row_merge_write_blob_to_tmp_file(field, blob_file, heap); + if (err != DB_SUCCESS) + return err; + } + + return DB_SUCCESS; +} + +/** Write a buffer to a block. 
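+Each tuple is encoded by row_merge_buf_encode(): extra_size + 1 is written in one byte if it is less than 0x80, or in two bytes otherwise (for example, extra_size 0x7e becomes the byte 0x7f, and 0x123 becomes the bytes 0x81 0x24), followed by the record in temporary format. The chunk is terminated by a single 0 byte.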
+@param buf sorted buffer +@param block buffer for writing to file +@param blob_file blob file handle for doing bulk insert operation */ +dberr_t row_merge_buf_write(const row_merge_buf_t *buf, +#ifndef DBUG_OFF + const merge_file_t *of, /*!< output file */ +#endif + row_merge_block_t *block, + merge_file_t *blob_file) +{ + const dict_index_t* index = buf->index; + ulint n_fields= dict_index_get_n_fields(index); + byte* b = &block[0]; + mem_heap_t* blob_heap = nullptr; + dberr_t err = DB_SUCCESS; + + DBUG_ENTER("row_merge_buf_write"); + + for (ulint i = 0; i < buf->n_tuples; i++) { + const mtuple_t* entry = &buf->tuples[i]; + + if (blob_file) { + ut_ad(buf->index->is_primary()); + err = row_merge_buf_blob( + entry, n_fields, &blob_heap, blob_file); + if (err != DB_SUCCESS) { + goto func_exit; + } + } + + ulint rec_size= row_merge_buf_encode( + &b, index, entry, n_fields); + if (blob_file && rec_size > srv_page_size) { + err = DB_TOO_BIG_RECORD; + goto func_exit; + } + + ut_ad(b < &block[srv_sort_buf_size]); + + DBUG_LOG("ib_merge_sort", + reinterpret_cast<const void*>(b) << ',' + << of->fd << ',' << of->offset << ' ' << + i << ": " << + rec_printer(entry->fields, n_fields).str()); + } + + /* Write an "end-of-chunk" marker. */ + ut_a(b < &block[srv_sort_buf_size]); + ut_a(b == &block[0] + buf->total_size || blob_file); + *b++ = 0; +#ifdef HAVE_valgrind + /* The rest of the block is uninitialized. Initialize it + to avoid bogus warnings. */ + memset(b, 0xff, &block[srv_sort_buf_size] - b); +#endif /* HAVE_valgrind */ + DBUG_LOG("ib_merge_sort", + "write " << reinterpret_cast<const void*>(b) << ',' + << of->fd << ',' << of->offset << " EOF"); +func_exit: + if (blob_heap) { + mem_heap_free(blob_heap); + } + + DBUG_RETURN(err); +} + +/******************************************************//** +Create a memory heap and allocate space for row_merge_rec_offsets() +and mrec_buf_t[3]. +@return memory heap */ +static +mem_heap_t* +row_merge_heap_create( +/*==================*/ + const dict_index_t* index, /*!< in: record descriptor */ + mrec_buf_t** buf, /*!< out: 3 buffers */ + rec_offs** offsets1, /*!< out: offsets */ + rec_offs** offsets2) /*!< out: offsets */ +{ + ulint i = 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + mem_heap_t* heap = mem_heap_create(2 * i * sizeof **offsets1 + + 3 * sizeof **buf); + + *buf = static_cast<mrec_buf_t*>( + mem_heap_alloc(heap, 3 * sizeof **buf)); + *offsets1 = static_cast<rec_offs*>( + mem_heap_alloc(heap, i * sizeof **offsets1)); + *offsets2 = static_cast<rec_offs*>( + mem_heap_alloc(heap, i * sizeof **offsets2)); + + rec_offs_set_n_alloc(*offsets1, i); + rec_offs_set_n_alloc(*offsets2, i); + rec_offs_set_n_fields(*offsets1, dict_index_get_n_fields(index)); + rec_offs_set_n_fields(*offsets2, dict_index_get_n_fields(index)); + + return(heap); +} + +/** Read a merge block from the file system. 
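+If srv_encrypt_log is set, the block is decrypted with log_tmp_block_decrypt() into crypt_buf and then copied back to buf.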
+@return whether the request was completed successfully */ +bool +row_merge_read( +/*===========*/ + const pfs_os_file_t& fd, /*!< in: file descriptor */ + ulint offset, /*!< in: offset where to read + in number of row_merge_block_t + elements */ + row_merge_block_t* buf, /*!< out: data */ + row_merge_block_t* crypt_buf, /*!< in: crypt buf or NULL */ + ulint space) /*!< in: space id */ +{ + os_offset_t ofs = ((os_offset_t) offset) * srv_sort_buf_size; + + DBUG_ENTER("row_merge_read"); + DBUG_LOG("ib_merge_sort", "fd=" << fd << " ofs=" << ofs); + DBUG_EXECUTE_IF("row_merge_read_failure", DBUG_RETURN(FALSE);); + + const dberr_t err = os_file_read( + IORequestRead, fd, buf, ofs, srv_sort_buf_size, nullptr); + + /* If encryption is enabled decrypt buffer */ + if (err == DB_SUCCESS && srv_encrypt_log) { + if (!log_tmp_block_decrypt(buf, srv_sort_buf_size, + crypt_buf, ofs)) { + DBUG_RETURN(false); + } + + srv_stats.n_merge_blocks_decrypted.inc(); + memcpy(buf, crypt_buf, srv_sort_buf_size); + } + +#ifdef POSIX_FADV_DONTNEED + /* Each block is read exactly once. Free up the file cache. */ + posix_fadvise(fd, ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED); +#endif /* POSIX_FADV_DONTNEED */ + + DBUG_RETURN(err == DB_SUCCESS); +} + +/********************************************************************//** +Write a merge block to the file system. +@return whether the request was completed successfully +@retval false on error +@retval true on success */ +bool +row_merge_write( + const pfs_os_file_t& fd, /*!< in: file descriptor */ + ulint offset, /*!< in: offset where to write, + in number of row_merge_block_t elements */ + const void* buf, /*!< in: data */ + void* crypt_buf, /*!< in: crypt buf or NULL */ + ulint space) /*!< in: space id */ +{ + size_t buf_len = srv_sort_buf_size; + os_offset_t ofs = buf_len * (os_offset_t) offset; + void* out_buf = (void *)buf; + + DBUG_ENTER("row_merge_write"); + DBUG_LOG("ib_merge_sort", "fd=" << fd << " ofs=" << ofs); + DBUG_EXECUTE_IF("row_merge_write_failure", DBUG_RETURN(FALSE);); + + /* For encrypted tables, encrypt data before writing */ + if (srv_encrypt_log) { + if (!log_tmp_block_encrypt(static_cast<const byte*>(buf), + buf_len, + static_cast<byte*>(crypt_buf), + ofs)) { + DBUG_RETURN(false); + } + + srv_stats.n_merge_blocks_encrypted.inc(); + out_buf = crypt_buf; + } + + const bool success = DB_SUCCESS == os_file_write( + IORequestWrite, "(merge)", fd, out_buf, ofs, buf_len); + +#ifdef POSIX_FADV_DONTNEED + /* The block will be needed on the next merge pass, + but it can be evicted from the file cache meanwhile. */ + posix_fadvise(fd, ofs, buf_len, POSIX_FADV_DONTNEED); +#endif /* POSIX_FADV_DONTNEED */ + + DBUG_RETURN(success); +} + +/********************************************************************//** +Read a merge record. 
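+A record may span two blocks; in that case the next block is read and the record is assembled contiguously in the auxiliary buffer buf.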
+@return pointer to next record, or NULL on I/O error or end of list */ +const byte* +row_merge_read_rec( +/*===============*/ + row_merge_block_t* block, /*!< in/out: file buffer */ + mrec_buf_t* buf, /*!< in/out: secondary buffer */ + const byte* b, /*!< in: pointer to record */ + const dict_index_t* index, /*!< in: index of the record */ + const pfs_os_file_t& fd, /*!< in: file descriptor */ + ulint* foffs, /*!< in/out: file offset */ + const mrec_t** mrec, /*!< out: pointer to merge record, + or NULL on end of list + (non-NULL on I/O error) */ + rec_offs* offsets,/*!< out: offsets of mrec */ + row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */ + ulint space) /*!< in: space id */ +{ + ulint extra_size; + ulint data_size; + ulint avail_size; + + ut_ad(b >= &block[0]); + ut_ad(b < &block[srv_sort_buf_size]); + + ut_ad(rec_offs_get_n_alloc(offsets) == 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index)); + + DBUG_ENTER("row_merge_read_rec"); + + extra_size = *b++; + + if (UNIV_UNLIKELY(!extra_size)) { + /* End of list */ + *mrec = NULL; + DBUG_LOG("ib_merge_sort", + "read " << reinterpret_cast<const void*>(b) << ',' << + reinterpret_cast<const void*>(block) << ',' << + fd << ',' << *foffs << " EOF"); + DBUG_RETURN(NULL); + } + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. */ + + if (UNIV_UNLIKELY(b >= &block[srv_sort_buf_size])) { + if (!row_merge_read(fd, ++(*foffs), block, + crypt_block, + space)) { +err_exit: + /* Signal I/O error. */ + *mrec = b; + DBUG_RETURN(NULL); + } + + /* Wrap around to the beginning of the buffer. */ + b = &block[0]; + } + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *b++; + } + + /* Normalize extra_size. Above, value 0 signals "end of list". */ + extra_size--; + + /* Read the extra bytes. */ + + if (UNIV_UNLIKELY(b + extra_size >= &block[srv_sort_buf_size])) { + /* The record spans two blocks. Copy the entire record + to the auxiliary buffer and handle this as a special + case. */ + + avail_size = ulint(&block[srv_sort_buf_size] - b); + ut_ad(avail_size < sizeof *buf); + memcpy(*buf, b, avail_size); + + if (!row_merge_read(fd, ++(*foffs), block, + crypt_block, + space)) { + + goto err_exit; + } + + /* Wrap around to the beginning of the buffer. */ + b = &block[0]; + + /* Copy the record. */ + memcpy(*buf + avail_size, b, extra_size - avail_size); + b += extra_size - avail_size; + + *mrec = *buf + extra_size; + + rec_init_offsets_temp(*mrec, index, offsets); + + data_size = rec_offs_data_size(offsets); + + /* These overflows should be impossible given that + records are much smaller than either buffer, and + the record starts near the beginning of each buffer. */ + ut_a(extra_size + data_size < sizeof *buf); + ut_a(b + data_size < &block[srv_sort_buf_size]); + + /* Copy the data bytes. */ + memcpy(*buf + extra_size, b, data_size); + b += data_size; + + goto func_exit; + } + + *mrec = b + extra_size; + + rec_init_offsets_temp(*mrec, index, offsets); + + data_size = rec_offs_data_size(offsets); + ut_ad(extra_size + data_size < sizeof *buf); + + b += extra_size + data_size; + + if (UNIV_LIKELY(b < &block[srv_sort_buf_size])) { + /* The record fits entirely in the block. + This is the normal case. */ + goto func_exit; + } + + /* The record spans two blocks. Copy it to buf. 
*/ + + b -= extra_size + data_size; + avail_size = ulint(&block[srv_sort_buf_size] - b); + memcpy(*buf, b, avail_size); + *mrec = *buf + extra_size; + + rec_init_offsets_temp(*mrec, index, offsets); + + if (!row_merge_read(fd, ++(*foffs), block, + crypt_block, + space)) { + + goto err_exit; + } + + /* Wrap around to the beginning of the buffer. */ + b = &block[0]; + + /* Copy the rest of the record. */ + memcpy(*buf + avail_size, b, extra_size + data_size - avail_size); + b += extra_size + data_size - avail_size; + +func_exit: + DBUG_LOG("ib_merge_sort", + reinterpret_cast<const void*>(b) << ',' << + reinterpret_cast<const void*>(block) + << ",fd=" << fd << ',' << *foffs << ": " + << rec_printer(*mrec, 0, offsets).str()); + DBUG_RETURN(b); +} + +/********************************************************************//** +Write a merge record. */ +static +void +row_merge_write_rec_low( +/*====================*/ + byte* b, /*!< out: buffer */ + ulint e, /*!< in: encoded extra_size */ +#ifndef DBUG_OFF + ulint size, /*!< in: total size to write */ + const pfs_os_file_t& fd, /*!< in: file descriptor */ + ulint foffs, /*!< in: file offset */ +#endif /* !DBUG_OFF */ + const mrec_t* mrec, /*!< in: record to write */ + const rec_offs* offsets)/*!< in: offsets of mrec */ +#ifdef DBUG_OFF +# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets) \ + row_merge_write_rec_low(b, e, mrec, offsets) +#endif /* DBUG_OFF */ +{ + DBUG_ENTER("row_merge_write_rec_low"); + +#ifndef DBUG_OFF + const byte* const end = b + size; +#endif /* DBUG_OFF */ + DBUG_ASSERT(e == rec_offs_extra_size(offsets) + 1); + + DBUG_LOG("ib_merge_sort", + reinterpret_cast<const void*>(b) << ",fd=" << fd << ',' + << foffs << ": " << rec_printer(mrec, 0, offsets).str()); + + if (e < 0x80) { + *b++ = (byte) e; + } else { + *b++ = (byte) (0x80 | (e >> 8)); + *b++ = (byte) e; + } + + memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets)); + DBUG_SLOW_ASSERT(b + rec_offs_size(offsets) == end); + DBUG_VOID_RETURN; +} + +/********************************************************************//** +Write a merge record. +@return pointer to end of block, or NULL on error */ +static +byte* +row_merge_write_rec( +/*================*/ + row_merge_block_t* block, /*!< in/out: file buffer */ + mrec_buf_t* buf, /*!< in/out: secondary buffer */ + byte* b, /*!< in: pointer to end of block */ + const pfs_os_file_t& fd, /*!< in: file descriptor */ + ulint* foffs, /*!< in/out: file offset */ + const mrec_t* mrec, /*!< in: record to write */ + const rec_offs* offsets,/*!< in: offsets of mrec */ + row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */ + ulint space) /*!< in: space id */ +{ + ulint extra_size; + ulint size; + ulint avail_size; + + ut_ad(block); + ut_ad(buf); + ut_ad(b >= &block[0]); + ut_ad(b < &block[srv_sort_buf_size]); + ut_ad(mrec); + ut_ad(foffs); + ut_ad(mrec < &block[0] || mrec > &block[srv_sort_buf_size]); + ut_ad(mrec < buf[0] || mrec > buf[1]); + + /* Normalize extra_size. Value 0 signals "end of list". */ + extra_size = rec_offs_extra_size(offsets) + 1; + + size = extra_size + (extra_size >= 0x80) + + rec_offs_data_size(offsets); + + if (UNIV_UNLIKELY(b + size >= &block[srv_sort_buf_size])) { + /* The record spans two blocks. + Copy it to the temporary buffer first. 
*/ + avail_size = ulint(&block[srv_sort_buf_size] - b); + + row_merge_write_rec_low(buf[0], + extra_size, size, fd, *foffs, + mrec, offsets); + + /* Copy the head of the temporary buffer, write + the completed block, and copy the tail of the + record to the head of the new block. */ + memcpy(b, buf[0], avail_size); + + if (!row_merge_write(fd, (*foffs)++, block, + crypt_block, + space)) { + return(NULL); + } + + MEM_UNDEFINED(&block[0], srv_sort_buf_size); + + /* Copy the rest. */ + b = &block[0]; + memcpy(b, buf[0] + avail_size, size - avail_size); + b += size - avail_size; + } else { + row_merge_write_rec_low(b, extra_size, size, fd, *foffs, + mrec, offsets); + b += size; + } + + return(b); +} + +/********************************************************************//** +Write an end-of-list marker. +@return pointer to end of block, or NULL on error */ +static +byte* +row_merge_write_eof( +/*================*/ + row_merge_block_t* block, /*!< in/out: file buffer */ + byte* b, /*!< in: pointer to end of block */ + const pfs_os_file_t& fd, /*!< in: file descriptor */ + ulint* foffs, /*!< in/out: file offset */ + row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */ + ulint space) /*!< in: space id */ +{ + ut_ad(block); + ut_ad(b >= &block[0]); + ut_ad(b < &block[srv_sort_buf_size]); + ut_ad(foffs); + + DBUG_ENTER("row_merge_write_eof"); + DBUG_LOG("ib_merge_sort", + reinterpret_cast<const void*>(b) << ',' << + reinterpret_cast<const void*>(block) << + ",fd=" << fd << ',' << *foffs); + + *b++ = 0; + MEM_CHECK_DEFINED(&block[0], b - &block[0]); + MEM_CHECK_ADDRESSABLE(&block[0], srv_sort_buf_size); + + /* The rest of the block is uninitialized. Silence warnings. */ + MEM_MAKE_DEFINED(b, &block[srv_sort_buf_size] - b); + + if (!row_merge_write(fd, (*foffs)++, block, crypt_block, space)) { + DBUG_RETURN(NULL); + } + + MEM_UNDEFINED(&block[0], srv_sort_buf_size); + DBUG_RETURN(&block[0]); +} + +/** Create a temporary file if it has not been created already. +@param[in,out] tmpfd temporary file handle +@param[in] path location for creating temporary file +@return true on success, false on error */ +static MY_ATTRIBUTE((warn_unused_result)) +bool +row_merge_tmpfile_if_needed( + pfs_os_file_t* tmpfd, + const char* path) +{ + if (*tmpfd == OS_FILE_CLOSED) { + *tmpfd = row_merge_file_create_low(path); + if (*tmpfd != OS_FILE_CLOSED) { + MONITOR_ATOMIC_INC(MONITOR_ALTER_TABLE_SORT_FILES); + } + } + + return(*tmpfd != OS_FILE_CLOSED); +} + +/** Create a temporary file for merge sort if it was not created already. +@param[in,out] file merge file structure +@param[in] nrec number of records in the file +@param[in] path location for creating temporary file +@return true on success, false on error */ +static MY_ATTRIBUTE((warn_unused_result)) +bool +row_merge_file_create_if_needed( + merge_file_t* file, + pfs_os_file_t* tmpfd, + ulint nrec, + const char* path) +{ + ut_ad(file->fd == OS_FILE_CLOSED || *tmpfd != OS_FILE_CLOSED); + if (file->fd == OS_FILE_CLOSED && row_merge_file_create(file, path)!= OS_FILE_CLOSED) { + MONITOR_ATOMIC_INC(MONITOR_ALTER_TABLE_SORT_FILES); + if (!row_merge_tmpfile_if_needed(tmpfd, path) ) { + return(false); + } + + file->n_rec = nrec; + } + + ut_ad(file->fd == OS_FILE_CLOSED || *tmpfd != OS_FILE_CLOSED); + return(file->fd != OS_FILE_CLOSED); +} + +/** Copy the merge data tuple from another merge data tuple. 
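+Only the first n_unique fields are copied; their data is duplicated into heap so that the copy stays valid after the source buffer is emptied.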
+@param[in] mtuple source merge data tuple +@param[in,out] prev_mtuple destination merge data tuple +@param[in] n_unique number of unique fields exist in the mtuple +@param[in,out] heap memory heap where last_mtuple allocated */ +static +void +row_mtuple_create( + const mtuple_t* mtuple, + mtuple_t* prev_mtuple, + ulint n_unique, + mem_heap_t* heap) +{ + memcpy(prev_mtuple->fields, mtuple->fields, + n_unique * sizeof *mtuple->fields); + + dfield_t* field = prev_mtuple->fields; + + for (ulint i = 0; i < n_unique; i++) { + dfield_dup(field++, heap); + } +} + +/** Compare two merge data tuples. +@param[in] prev_mtuple merge data tuple +@param[in] current_mtuple merge data tuple +@param[in,out] dup reporter of duplicates +@retval positive, 0, negative if current_mtuple is greater, equal, less, than +last_mtuple. */ +static +int +row_mtuple_cmp( + const mtuple_t* prev_mtuple, + const mtuple_t* current_mtuple, + row_merge_dup_t* dup) +{ + ut_ad(dup->index->is_primary()); + const ulint n_uniq= dup->index->n_uniq; + return row_merge_tuple_cmp(dup->index, n_uniq, n_uniq, + *current_mtuple, *prev_mtuple, dup); +} + +/** Insert cached spatial index rows. +@param[in] trx_id transaction id +@param[in] sp_tuples cached spatial rows +@param[in] num_spatial number of spatial indexes +@param[in,out] heap temporary memory heap +@param[in,out] pcur cluster index cursor +@param[in,out] started whether mtr is active +@param[in,out] mtr mini-transaction +@return DB_SUCCESS or error number */ +static +dberr_t +row_merge_spatial_rows( + trx_id_t trx_id, + spatial_index_info** sp_tuples, + ulint num_spatial, + mem_heap_t* heap, + btr_pcur_t* pcur, + bool& started, + mtr_t* mtr) +{ + if (!sp_tuples) + return DB_SUCCESS; + + for (ulint j= 0; j < num_spatial; j++) + if (dberr_t err= sp_tuples[j]->insert(trx_id, pcur, started, heap, mtr)) + return err; + + mem_heap_empty(heap); + return DB_SUCCESS; +} + +/** Check if the geometry field is valid. +@param[in] row the row +@param[in] index spatial index +@return true if it's valid, false if it's invalid. */ +static +bool +row_geo_field_is_valid( + const dtuple_t* row, + dict_index_t* index) +{ + const dict_field_t* ind_field + = dict_index_get_nth_field(index, 0); + const dict_col_t* col + = ind_field->col; + ulint col_no + = dict_col_get_no(col); + const dfield_t* dfield + = dtuple_get_nth_field(row, col_no); + + if (dfield_is_null(dfield) + || dfield_get_len(dfield) < GEO_DATA_HEADER_SIZE) { + return(false); + } + + return(true); +} + +/** Reads clustered index of the table and create temporary files +containing the index entries for the indexes to be built. 
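+Entries for all the indexes are built in a single pass over the clustered index; each full sort buffer is sorted and written to the temporary files, or inserted directly into the index when sorting can be skipped.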
+@param[in] trx transaction +@param[in,out] table MySQL table object, for reporting erroneous + records +@param[in] old_table table where rows are read from +@param[in] new_table table where indexes are created; identical to + old_table unless creating a PRIMARY KEY +@param[in] online true if creating indexes online +@param[in] index indexes to be created +@param[in] fts_sort_idx full-text index to be created, or NULL +@param[in] psort_info parallel sort info for fts_sort_idx creation, + or NULL +@param[in] files temporary files +@param[in] key_numbers MySQL key numbers to create +@param[in] n_index number of indexes to create +@param[in] defaults default values of added, changed columns, or NULL +@param[in] add_v newly added virtual columns along with indexes +@param[in] col_map mapping of old column numbers to new ones, or +NULL if old_table == new_table +@param[in] add_autoinc number of added AUTO_INCREMENT columns, or +ULINT_UNDEFINED if none is added +@param[in,out] sequence autoinc sequence +@param[in,out] block file buffer +@param[in] skip_pk_sort whether the new PRIMARY KEY will follow +existing order +@param[in,out] tmpfd temporary file handle +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. stage->n_pk_recs_inc() will be called for each record read and +stage->inc() will be called for each page read. +@param[in] pct_cost percent of task weight out of total alter job +@param[in,out] crypt_block crypted file buffer +@param[in] eval_table mysql table used to evaluate virtual column + value, see innobase_get_computed_value(). +@param[in] allow_not_null allow null to not-null conversion +@param[in] col_collate columns whose collations changed, or nullptr +@return DB_SUCCESS or error */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +row_merge_read_clustered_index( + trx_t* trx, + struct TABLE* table, + const dict_table_t* old_table, + dict_table_t* new_table, + bool online, + dict_index_t** index, + dict_index_t* fts_sort_idx, + fts_psort_t* psort_info, + merge_file_t* files, + const ulint* key_numbers, + ulint n_index, + const dtuple_t* defaults, + const dict_add_v_col_t* add_v, + const ulint* col_map, + ulint add_autoinc, + ib_sequence_t& sequence, + row_merge_block_t* block, + bool skip_pk_sort, + pfs_os_file_t* tmpfd, + ut_stage_alter_t* stage, + double pct_cost, + row_merge_block_t* crypt_block, + struct TABLE* eval_table, + bool allow_not_null, + const col_collations* col_collate) +{ + dict_index_t* clust_index; /* Clustered index */ + mem_heap_t* row_heap = NULL;/* Heap memory to create + clustered index tuples */ + row_merge_buf_t** merge_buf; /* Temporary list for records*/ + mem_heap_t* v_heap = NULL; /* Heap memory to process large + data for virtual column */ + btr_pcur_t pcur; /* Cursor on the clustered + index */ + mtr_t mtr; /* Mini transaction */ + bool mtr_started = false; + dberr_t err = DB_SUCCESS;/* Return code */ + ulint n_nonnull = 0; /* number of columns + changed to NOT NULL */ + ulint* nonnull = NULL; /* NOT NULL columns */ + dict_index_t* fts_index = NULL;/* FTS index */ + doc_id_t doc_id = 0; + doc_id_t max_doc_id = 0; + ibool add_doc_id = FALSE; + pthread_cond_t* fts_parallel_sort_cond = nullptr; + spatial_index_info** sp_tuples = nullptr; + ulint num_spatial = 0; + BtrBulk* clust_btr_bulk = NULL; + bool clust_temp_file = false; + mem_heap_t* mtuple_heap = NULL; + mtuple_t prev_mtuple; + mem_heap_t* conv_heap = NULL; + double curr_progress = 0.0; + ib_uint64_t read_rows = 0; + ib_uint64_t table_total_rows = 0; + char 
new_sys_trx_start[8]; + char new_sys_trx_end[8]; + byte any_autoinc_data[8] = {0}; + bool vers_update_trt = false; + + DBUG_ENTER("row_merge_read_clustered_index"); + + ut_ad((old_table == new_table) == !col_map); + ut_ad(!defaults || col_map); + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + ut_ad(trx->id); + + table_total_rows = dict_table_get_n_rows(old_table); + if(table_total_rows == 0) { + /* We don't know total row count */ + table_total_rows = 1; + } + + trx->op_info = "reading clustered index"; + +#ifdef FTS_INTERNAL_DIAG_PRINT + DEBUG_FTS_SORT_PRINT("FTS_SORT: Start Create Index\n"); +#endif + + /* Create and initialize memory for record buffers */ + + merge_buf = static_cast<row_merge_buf_t**>( + ut_malloc_nokey(n_index * sizeof *merge_buf)); + + row_merge_dup_t clust_dup = {index[0], table, col_map, 0}; + dfield_t* prev_fields = nullptr; + const ulint n_uniq = dict_index_get_n_unique(index[0]); + + ut_ad(trx->mysql_thd != NULL); + + const char* path = thd_innodb_tmpdir(trx->mysql_thd); + + ut_ad(!skip_pk_sort || dict_index_is_clust(index[0])); + /* There is no previous tuple yet. */ + prev_mtuple.fields = NULL; + + for (ulint i = 0; i < n_index; i++) { + if (index[i]->type & DICT_FTS) { + + /* We are building a FT index, make sure + we have the temporary 'fts_sort_idx' */ + ut_a(fts_sort_idx); + + fts_index = index[i]; + + merge_buf[i] = row_merge_buf_create(fts_sort_idx); + + add_doc_id = DICT_TF2_FLAG_IS_SET( + new_table, DICT_TF2_FTS_ADD_DOC_ID); + + /* If Doc ID does not exist in the table itself, + fetch the first FTS Doc ID */ + if (add_doc_id) { + fts_get_next_doc_id( + (dict_table_t*) new_table, + &doc_id); + ut_ad(doc_id > 0); + } + + row_fts_start_psort(psort_info); + fts_parallel_sort_cond = + &psort_info[0].psort_common->sort_cond; + } else { + if (dict_index_is_spatial(index[i])) { + num_spatial++; + } + + merge_buf[i] = row_merge_buf_create(index[i]); + } + } + + if (num_spatial > 0) { + ulint count = 0; + + sp_tuples = static_cast<spatial_index_info**>( + ut_malloc_nokey(num_spatial + * sizeof(*sp_tuples))); + + for (ulint i = 0; i < n_index; i++) { + if (dict_index_is_spatial(index[i])) { + sp_tuples[count] + = UT_NEW_NOKEY( + spatial_index_info(index[i])); + count++; + } + } + + ut_ad(count == num_spatial); + } + + mtr.start(); + mtr_started = true; + + /* Find the clustered index and create a persistent cursor + based on that. */ + + clust_index = dict_table_get_first_index(old_table); + const ulint old_trx_id_col = ulint(old_table->n_cols) + - (DATA_N_SYS_COLS - DATA_TRX_ID); + ut_ad(old_table->cols[old_trx_id_col].mtype == DATA_SYS); + ut_ad(old_table->cols[old_trx_id_col].prtype + == (DATA_TRX_ID | DATA_NOT_NULL)); + ut_ad(old_table->cols[old_trx_id_col + 1].mtype == DATA_SYS); + ut_ad(old_table->cols[old_trx_id_col + 1].prtype + == (DATA_ROLL_PTR | DATA_NOT_NULL)); + const ulint new_trx_id_col = col_map + ? col_map[old_trx_id_col] : old_trx_id_col; + uint64_t n_rows = 0; + + err = pcur.open_leaf(true, clust_index, BTR_SEARCH_LEAF, &mtr); + if (err != DB_SUCCESS) { +err_exit: + trx->error_key_num = 0; + goto func_exit; + } else { + rec_t* rec = page_rec_get_next(btr_pcur_get_rec(&pcur)); + if (!rec) { +corrupted_metadata: + err = DB_CORRUPTION; + goto err_exit; + } + if (rec_get_info_bits(rec, page_rec_is_comp(rec)) + & REC_INFO_MIN_REC_FLAG) { + if (!clust_index->is_instant()) { + goto corrupted_metadata; + } + if (page_rec_is_comp(rec) + && rec_get_status(rec) != REC_STATUS_INSTANT) { + goto corrupted_metadata; + } + /* Skip the metadata pseudo-record. 
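+			   (the hidden record, flagged with REC_INFO_MIN_REC_FLAG, that instant ALTER TABLE stores at the start of the clustered index; it is skipped and not copied)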
*/ + btr_pcur_get_page_cur(&pcur)->rec = rec; + } else if (clust_index->is_instant()) { + goto corrupted_metadata; + } + } + + /* Check if the table is supposed to be empty for our read view. + + If we read bulk_trx_id as an older transaction ID, it is not + incorrect to check here whether that transaction should be + visible to us. If bulk_trx_id is not visible to us, the table + must have been empty at an earlier point of time, also in our + read view. + + An INSERT would only update bulk_trx_id in + row_ins_clust_index_entry_low() if the table really was empty + (everything had been purged), when holding a leaf page latch + in the clustered index (actually, the root page is the only + leaf page in that case). + + We are holding a clustered index leaf page latch here. + That will obviously prevent any concurrent INSERT from + updating bulk_trx_id while we read it. */ + if (!online) { + } else if (trx_id_t bulk_trx_id = old_table->bulk_trx_id) { + ut_ad(trx->read_view.is_open()); + ut_ad(bulk_trx_id != trx->id); + if (!trx->read_view.changes_visible(bulk_trx_id)) { + goto func_exit; + } + } + + if (old_table != new_table) { + /* The table is being rebuilt. Identify the columns + that were flagged NOT NULL in the new table, so that + we can quickly check that the records in the old table + do not violate the added NOT NULL constraints. */ + + nonnull = static_cast<ulint*>( + ut_malloc_nokey(dict_table_get_n_cols(new_table) + * sizeof *nonnull)); + + for (ulint i = 0; i < dict_table_get_n_cols(old_table); i++) { + if (dict_table_get_nth_col(old_table, i)->prtype + & DATA_NOT_NULL) { + continue; + } + + const ulint j = col_map[i]; + + if (j == ULINT_UNDEFINED) { + /* The column was dropped. */ + continue; + } + + if (dict_table_get_nth_col(new_table, j)->prtype + & DATA_NOT_NULL) { + nonnull[n_nonnull++] = j; + } + } + + if (!n_nonnull) { + ut_free(nonnull); + nonnull = NULL; + } + } + + row_heap = mem_heap_create(sizeof(mrec_buf_t)); + + if (dict_table_is_comp(old_table) + && !dict_table_is_comp(new_table)) { + conv_heap = mem_heap_create(sizeof(mrec_buf_t)); + } + + if (skip_pk_sort) { + prev_fields = static_cast<dfield_t*>( + ut_malloc_nokey(n_uniq * sizeof *prev_fields)); + mtuple_heap = mem_heap_create(sizeof(mrec_buf_t)); + } + + mach_write_to_8(new_sys_trx_start, trx->id); + mach_write_to_8(new_sys_trx_end, TRX_ID_MAX); + + /* Scan the clustered index. */ + for (;;) { + /* Do not continue if table pages are still encrypted */ + if (!old_table->is_readable() || !new_table->is_readable()) { + err = DB_DECRYPTION_FAILED; + goto err_exit; + } + + const rec_t* rec; + trx_id_t rec_trx_id; + rec_offs* offsets; + dtuple_t* row; + row_ext_t* ext; + page_cur_t* cur = btr_pcur_get_page_cur(&pcur); + bool history_row, history_fts = false; + + stage->n_pk_recs_inc(); + + if (!page_cur_move_to_next(cur)) { +corrupted_rec: + err = DB_CORRUPTION; + goto err_exit; + } + + if (page_cur_is_after_last(cur)) { + + stage->inc(); + + if (UNIV_UNLIKELY(trx_is_interrupted(trx))) { + err = DB_INTERRUPTED; + goto err_exit; + } + + if (online && old_table != new_table) { + err = row_log_table_get_error(clust_index); + if (err != DB_SUCCESS) { + goto err_exit; + } + } + + /* Insert the cached spatial index rows. 
*/ + err = row_merge_spatial_rows( + trx->id, sp_tuples, num_spatial, + row_heap, &pcur, mtr_started, &mtr); + + if (err != DB_SUCCESS) { + goto func_exit; + } + + mem_heap_empty(row_heap); + + if (!mtr_started) { + goto scan_next; + } + + if (clust_index->lock.is_waiting()) { + /* There are waiters on the clustered + index tree lock, likely the purge + thread. Store and restore the cursor + position, and yield so that scanning a + large table will not starve other + threads. */ + + /* Store the cursor position on the last user + record on the page. */ + if (!btr_pcur_move_to_prev_on_page(&pcur)) { + goto corrupted_index; + } + /* Leaf pages must never be empty, unless + this is the only page in the index tree. */ + if (!btr_pcur_is_on_user_rec(&pcur) + && btr_pcur_get_block(&pcur)->page.id() + .page_no() != clust_index->page) { + goto corrupted_index; + } + + btr_pcur_store_position(&pcur, &mtr); + mtr.commit(); + mtr_started = false; + + /* Give the waiters a chance to proceed. */ + std::this_thread::yield(); +scan_next: + ut_ad(!mtr_started); + ut_ad(!mtr.is_active()); + mtr.start(); + mtr_started = true; + /* Restore position on the record, or its + predecessor if the record was purged + meanwhile. */ + if (pcur.restore_position(BTR_SEARCH_LEAF, + &mtr) + == btr_pcur_t::CORRUPTED) { +corrupted_index: + err = DB_CORRUPTION; + goto func_exit; + } + /* Move to the successor of the + original record. */ + if (!btr_pcur_move_to_next_user_rec( + &pcur, &mtr)) { +end_of_index: + row = NULL; + mtr.commit(); + mtr_started = false; + mem_heap_free(row_heap); + row_heap = NULL; + ut_free(nonnull); + nonnull = NULL; + goto write_buffers; + } + } else { + uint32_t next_page_no = btr_page_get_next( + page_cur_get_page(cur)); + + if (next_page_no == FIL_NULL) { + goto end_of_index; + } + + buf_block_t* block = buf_page_get_gen( + page_id_t(old_table->space->id, + next_page_no), + old_table->space->zip_size(), + RW_S_LATCH, nullptr, BUF_GET, &mtr, + &err, false); + if (!block) { + goto err_exit; + } + + page_cur_set_before_first(block, cur); + if (!page_cur_move_to_next(cur) + || page_cur_is_after_last(cur)) { + goto corrupted_rec; + } + + const auto s = mtr.get_savepoint(); + mtr.rollback_to_savepoint(s - 2, s - 1); + } + } else { + mem_heap_empty(row_heap); + } + + rec = page_cur_get_rec(cur); + + if (online) { + offsets = rec_get_offsets(rec, clust_index, NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &row_heap); + rec_trx_id = row_get_rec_trx_id(rec, clust_index, + offsets); + + /* Perform a REPEATABLE READ. + + When rebuilding the table online, + row_log_table_apply() must not see a newer + state of the table when applying the log. + This is mainly to prevent false duplicate key + errors, because the log will identify records + by the PRIMARY KEY, and also to prevent unsafe + BLOB access. + + When creating a secondary index online, this + table scan must not see records that have only + been inserted to the clustered index, but have + not been written to the online_log of + index[]. If we performed READ UNCOMMITTED, it + could happen that the ADD INDEX reaches + ONLINE_INDEX_COMPLETE state between the time + the DML thread has updated the clustered index + but has not yet accessed secondary index. 
*/ + ut_ad(trx->read_view.is_open()); + ut_ad(rec_trx_id != trx->id); + + if (!trx->read_view.changes_visible(rec_trx_id)) { + if (rec_trx_id + >= trx->read_view.low_limit_id() + && rec_trx_id + >= trx_sys.get_max_trx_id()) { + goto corrupted_rec; + } + + rec_t* old_vers; + + row_vers_build_for_consistent_read( + rec, &mtr, clust_index, &offsets, + &trx->read_view, &row_heap, + row_heap, &old_vers, NULL); + + if (!old_vers) { + continue; + } + + /* The old version must necessarily be + in the "prehistory", because the + exclusive lock in + ha_innobase::prepare_inplace_alter_table() + forced the completion of any transactions + that accessed this table. */ + ut_ad(row_get_rec_trx_id(old_vers, clust_index, + offsets) < trx->id); + + rec = old_vers; + rec_trx_id = 0; + } + + if (rec_get_deleted_flag( + rec, + dict_table_is_comp(old_table))) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. + Above, we did reset rec_trx_id = 0 + for rec = old_vers.*/ + ut_ad(rec == page_cur_get_rec(cur) + ? rec_trx_id + : !rec_trx_id); + /* This record was deleted in the latest + committed version, or it was deleted and + then reinserted-by-update before purge + kicked in. Skip it. */ + continue; + } + + ut_ad(!rec_offs_any_null_extern(rec, offsets)); + } else if (rec_get_deleted_flag( + rec, dict_table_is_comp(old_table))) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_d(rec_trx_id = rec_get_trx_id(rec, clust_index)); + ut_ad(rec_trx_id); + /* This must be a purgeable delete-marked record, + and the transaction that delete-marked the record + must have been committed before this + !online ALTER TABLE transaction. */ + ut_ad(rec_trx_id < trx->id); + /* Skip delete-marked records. + + Skipping delete-marked records will make the + created indexes unuseable for transactions + whose read views were created before the index + creation completed, but an attempt to preserve + the history would make it tricky to detect + duplicate keys. */ + continue; + } else { + offsets = rec_get_offsets(rec, clust_index, NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &row_heap); + /* This is a locking ALTER TABLE. + + If we are not rebuilding the table, the + DB_TRX_ID does not matter, as it is not being + written to any secondary indexes; see + if (old_table == new_table) below. + + If we are rebuilding the table, the + DB_TRX_ID,DB_ROLL_PTR should be reset, because + there will be no history available. */ + ut_ad(rec_get_trx_id(rec, clust_index) < trx->id); + rec_trx_id = 0; + } + + /* When !online, we are holding a lock on old_table, preventing + any inserts that could have written a record 'stub' before + writing out off-page columns. */ + ut_ad(!rec_offs_any_null_extern(rec, offsets)); + + /* Build a row based on the clustered index. 
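+		   The row is converted to the format of new_table, using col_map and defaults for added or reordered columns.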
*/ + + row = row_build_w_add_vcol(ROW_COPY_POINTERS, clust_index, + rec, offsets, new_table, + defaults, add_v, col_map, &ext, + row_heap); + ut_ad(row); + + history_row = new_table->versioned() + && dtuple_get_nth_field(row, new_table->vers_end) + ->vers_history_row(); + history_fts = history_row && new_table->fts; + + for (ulint i = 0; i < n_nonnull; i++) { + dfield_t* field = &row->fields[nonnull[i]]; + + ut_ad(dfield_get_type(field)->prtype & DATA_NOT_NULL); + + if (dfield_is_null(field)) { + + Field* null_field = + table->field[nonnull[i]]; + + null_field->set_warning( + Sql_condition::WARN_LEVEL_WARN, + WARN_DATA_TRUNCATED, 1, + ulong(n_rows + 1)); + + if (!allow_not_null) { + err = DB_INVALID_NULL; + goto err_exit; + } + + const dfield_t& default_field + = defaults->fields[nonnull[i]]; + + *field = default_field; + } + } + + /* Get the next Doc ID */ + if (add_doc_id && !history_fts) { + doc_id++; + } else { + doc_id = 0; + } + + ut_ad(row->fields[new_trx_id_col].type.mtype == DATA_SYS); + ut_ad(row->fields[new_trx_id_col].type.prtype + == (DATA_TRX_ID | DATA_NOT_NULL)); + ut_ad(row->fields[new_trx_id_col].len == DATA_TRX_ID_LEN); + ut_ad(row->fields[new_trx_id_col + 1].type.mtype == DATA_SYS); + ut_ad(row->fields[new_trx_id_col + 1].type.prtype + == (DATA_ROLL_PTR | DATA_NOT_NULL)); + ut_ad(row->fields[new_trx_id_col + 1].len == DATA_ROLL_PTR_LEN); + + if (old_table == new_table) { + /* Do not bother touching DB_TRX_ID,DB_ROLL_PTR + because they are not going to be written into + secondary indexes. */ + } else if (rec_trx_id < trx->id) { + /* Reset the DB_TRX_ID,DB_ROLL_PTR of old rows + for which history is not going to be + available after the rebuild operation. + This essentially mimics row_purge_reset_trx_id(). */ + row->fields[new_trx_id_col].data + = const_cast<byte*>(reset_trx_id); + row->fields[new_trx_id_col + 1].data + = const_cast<byte*>(reset_trx_id + + DATA_TRX_ID_LEN); + } + + if (add_autoinc != ULINT_UNDEFINED) { + + ut_ad(add_autoinc + < dict_table_get_n_user_cols(new_table)); + + dfield_t* dfield = dtuple_get_nth_field(row, + add_autoinc); + + if (new_table->versioned()) { + if (history_row) { + if (dfield_get_type(dfield)->prtype & DATA_NOT_NULL) { + err = DB_UNSUPPORTED; + my_error(ER_UNSUPPORTED_EXTENSION, MYF(0), + old_table->name.m_name); + goto func_exit; + } + dfield_set_null(dfield); + } else { + // set not null + ulint len = dfield_get_type(dfield)->len; + dfield_set_data(dfield, any_autoinc_data, len); + } + } + + if (dfield_is_null(dfield)) { + goto write_buffers; + } + + const dtype_t* dtype = dfield_get_type(dfield); + byte* b = static_cast<byte*>(dfield_get_data(dfield)); + + if (sequence.eof()) { + ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_AUTOINC_READ_FAILED, "[NULL]"); + err = DB_ERROR; + goto err_exit; + } + + ulonglong value = sequence++; + + switch (dtype_get_mtype(dtype)) { + case DATA_INT: { + ibool usign; + ulint len = dfield_get_len(dfield); + + usign = dtype_get_prtype(dtype) & DATA_UNSIGNED; + mach_write_ulonglong(b, value, len, usign); + + break; + } + + case DATA_FLOAT: + mach_float_write( + b, static_cast<float>(value)); + break; + + case DATA_DOUBLE: + mach_double_write( + b, static_cast<double>(value)); + break; + + default: + ut_ad(0); + } + } + + if (old_table->versioned()) { + if (!new_table->versioned() + && clust_index->vers_history_row(rec, offsets)) { + continue; + } + } else if (new_table->versioned()) { + dfield_t* start = + dtuple_get_nth_field(row, new_table->vers_start); + dfield_t* end = + dtuple_get_nth_field(row, 
new_table->vers_end); + dfield_set_data(start, new_sys_trx_start, 8); + dfield_set_data(end, new_sys_trx_end, 8); + vers_update_trt = true; + } + +write_buffers: + /* Build all entries for all the indexes to be created + in a single scan of the clustered index. */ + + n_rows++; + ulint s_idx_cnt = 0; + bool skip_sort = skip_pk_sort + && dict_index_is_clust(merge_buf[0]->index); + + for (ulint k = 0, i = 0; i < n_index; i++, skip_sort = false) { + row_merge_buf_t* buf = merge_buf[i]; + ulint rows_added = 0; + + if (dict_index_is_spatial(buf->index)) { + if (!row) { + continue; + } + + ut_ad(sp_tuples[s_idx_cnt]->index + == buf->index); + + /* If the geometry field is invalid, report + error. */ + if (!row_geo_field_is_valid(row, buf->index)) { + err = DB_CANT_CREATE_GEOMETRY_OBJECT; + break; + } + + sp_tuples[s_idx_cnt]->add(row, ext, buf->heap); + s_idx_cnt++; + + continue; + } + + ut_ad(!row + || !dict_index_is_clust(buf->index) + || trx_id_check(row->fields[new_trx_id_col].data, + trx->id)); + + merge_file_t* file = &files[k++]; + + if (UNIV_LIKELY + (row && (rows_added = row_merge_buf_add( + buf, fts_index, old_table, new_table, + psort_info, row, ext, history_fts, + &doc_id, conv_heap, &err, + &v_heap, eval_table, trx, + col_collate)))) { + + /* If we are creating FTS index, + a single row can generate more + records for tokenized word */ + file->n_rec += rows_added; + + if (err != DB_SUCCESS) { + ut_ad(err == DB_TOO_BIG_RECORD); + break; + } + + if (doc_id > max_doc_id) { + max_doc_id = doc_id; + } + + if (buf->index->type & DICT_FTS) { + /* Check if error occurs in child thread */ + for (ulint j = 0; + j < fts_sort_pll_degree; j++) { + if (psort_info[j].error + != DB_SUCCESS) { + err = psort_info[j].error; + trx->error_key_num = i; + break; + } + } + + if (err != DB_SUCCESS) { + break; + } + } + + if (skip_sort) { + ut_ad(buf->n_tuples > 0); + const mtuple_t* curr = + &buf->tuples[buf->n_tuples - 1]; + + ut_ad(i == 0); + ut_ad(dict_index_is_clust(merge_buf[0]->index)); + /* Detect duplicates by comparing the + current record with previous record. + When temp file is not used, records + should be in sorted order. */ + if (prev_mtuple.fields != NULL + && (row_mtuple_cmp( + &prev_mtuple, curr, + &clust_dup) == 0)) { + + err = DB_DUPLICATE_KEY; + trx->error_key_num + = key_numbers[0]; + goto func_exit; + } + + prev_mtuple.fields = curr->fields; + } + + continue; + } + + if (err == DB_COMPUTE_VALUE_FAILED) { + trx->error_key_num = i; + goto func_exit; + } + + if (buf->index->type & DICT_FTS) { + if (!row || !doc_id) { + continue; + } + } + + /* The buffer must be sufficiently large + to hold at least one record. It may only + be empty when we reach the end of the + clustered index. row_merge_buf_add() + must not have been called in this loop. */ + ut_ad(buf->n_tuples || row == NULL); + + /* We have enough data tuples to form a block. + Sort them and write to disk if temp file is used + or insert into index if temp file is not used. */ + ut_ad(old_table == new_table + ? !dict_index_is_clust(buf->index) + : (i == 0) == dict_index_is_clust(buf->index)); + + /* We have enough data tuples to form a block. + Sort them (if !skip_sort) and write to disk. */ + + if (buf->n_tuples) { + if (skip_sort) { + /* Temporary File is not used. + so insert sorted block to the index */ + if (row != NULL) { + /* We have to do insert the + cached spatial index rows, since + after the mtr_commit, the cluster + index page could be updated, then + the data in cached rows become + invalid. 
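+					The cached tuples were built with
+					ROW_COPY_POINTERS and can still point
+					into the clustered index page that this
+					mini-transaction has latched; once the
+					latch is released the page may change,
+					so flush them into the spatial indexes
+					first.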
*/ + err = row_merge_spatial_rows( + trx->id, sp_tuples, + num_spatial, + row_heap, + &pcur, mtr_started, + &mtr); + + if (err != DB_SUCCESS) { + goto func_exit; + } + + /* We are not at the end of + the scan yet. We must + mtr.commit() in order to be + able to call log_free_check() + in row_merge_insert_index_tuples(). + Due to mtr.commit(), the + current row will be invalid, and + we must reread it on the next + loop iteration. */ + if (mtr_started) { + if (!btr_pcur_move_to_prev_on_page(&pcur)) { + err = DB_CORRUPTION; + goto func_exit; + } + btr_pcur_store_position( + &pcur, &mtr); + + mtr.commit(); + mtr_started = false; + } + } + + mem_heap_empty(mtuple_heap); + prev_mtuple.fields = prev_fields; + + row_mtuple_create( + &buf->tuples[buf->n_tuples - 1], + &prev_mtuple, n_uniq, + mtuple_heap); + + if (clust_btr_bulk == NULL) { + clust_btr_bulk = UT_NEW_NOKEY( + BtrBulk(index[i], + trx)); + } else { + clust_btr_bulk->latch(); + } + + err = row_merge_insert_index_tuples( + index[i], old_table, + OS_FILE_CLOSED, NULL, buf, + clust_btr_bulk, + table_total_rows, + curr_progress, + pct_cost, + crypt_block, + new_table->space_id); + + if (row == NULL) { + err = clust_btr_bulk->finish( + err); + UT_DELETE(clust_btr_bulk); + clust_btr_bulk = NULL; + } else { + /* Release latches for possible + log_free_chck in spatial index + build. */ + clust_btr_bulk->release(); + } + + if (err != DB_SUCCESS) { + break; + } + + if (row != NULL) { + /* Restore the cursor on the + previous clustered index record, + and empty the buffer. The next + iteration of the outer loop will + advance the cursor and read the + next record (the one which we + had to ignore due to the buffer + overflow). */ + mtr.start(); + mtr_started = true; + if (pcur.restore_position( + BTR_SEARCH_LEAF, &mtr) + == btr_pcur_t::CORRUPTED) { + goto corrupted_index; + } + buf = row_merge_buf_empty(buf); + merge_buf[i] = buf; + /* Restart the outer loop on the + record. We did not insert it + into any index yet. */ + ut_ad(i == 0); + break; + } + } else if (dict_index_is_unique(buf->index)) { + row_merge_dup_t dup = { + buf->index, table, col_map, 0}; + + row_merge_buf_sort(buf, &dup); + + if (dup.n_dup) { + err = DB_DUPLICATE_KEY; + trx->error_key_num + = key_numbers[i]; + break; + } + } else { + row_merge_buf_sort(buf, NULL); + } + } else if (online && new_table == old_table) { + /* Note the newest transaction that + modified this index when the scan was + completed. We prevent older readers + from accessing this index, to ensure + read consistency. */ + + ut_a(row == NULL); + + dict_index_t* index = buf->index; + index->lock.x_lock(SRW_LOCK_CALL); + ut_a(dict_index_get_online_status(index) + == ONLINE_INDEX_CREATION); + + trx_id_t max_trx_id = row_log_get_max_trx( + index); + + if (max_trx_id > index->trx_id) { + index->trx_id = max_trx_id; + } + + index->lock.x_unlock(); + } + + /* Secondary index and clustered index which is + not in sorted order can use the temporary file. + Fulltext index should not use the temporary file. */ + if (!skip_sort && !(buf->index->type & DICT_FTS)) { + /* In case we can have all rows in sort buffer, + we can insert directly into the index without + temporary file if clustered index does not uses + temporary file. 
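+				That is the case when the scan has ended
+				(row == NULL), this index never spilled into a
+				temporary file (file->fd == OS_FILE_CLOSED) and
+				the clustered index did not use one either
+				(!clust_temp_file); the buffer then already
+				holds every remaining row in sorted order and
+				can be bulk-loaded directly.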
*/ + if (row == NULL && file->fd == OS_FILE_CLOSED + && !clust_temp_file) { + DBUG_EXECUTE_IF( + "row_merge_write_failure", + err = DB_TEMP_FILE_WRITE_FAIL; + trx->error_key_num = i; + goto all_done;); + + DBUG_EXECUTE_IF( + "row_merge_tmpfile_fail", + err = DB_OUT_OF_MEMORY; + trx->error_key_num = i; + goto all_done;); + + BtrBulk btr_bulk(index[i], trx); + + err = row_merge_insert_index_tuples( + index[i], old_table, + OS_FILE_CLOSED, NULL, buf, + &btr_bulk, + table_total_rows, + curr_progress, + pct_cost, + crypt_block, + new_table->space_id); + + err = btr_bulk.finish(err); + + DBUG_EXECUTE_IF( + "row_merge_insert_big_row", + err = DB_TOO_BIG_RECORD;); + + if (err != DB_SUCCESS) { + break; + } + } else { + if (!row_merge_file_create_if_needed( + file, tmpfd, + buf->n_tuples, path)) { + err = DB_OUT_OF_MEMORY; + trx->error_key_num = i; + break; + } + + /* Ensure that duplicates in the + clustered index will be detected before + inserting secondary index records. */ + if (dict_index_is_clust(buf->index)) { + clust_temp_file = true; + } + + ut_ad(file->n_rec > 0); + + row_merge_buf_write(buf, +#ifndef DBUG_OFF + file, +#endif + block); + + if (!row_merge_write( + file->fd, file->offset++, + block, crypt_block, + new_table->space_id)) { + err = DB_TEMP_FILE_WRITE_FAIL; + trx->error_key_num = i; + break; + } + + MEM_UNDEFINED( + &block[0], srv_sort_buf_size); + } + } + merge_buf[i] = row_merge_buf_empty(buf); + buf = merge_buf[i]; + + if (UNIV_LIKELY(row != NULL)) { + /* Try writing the record again, now + that the buffer has been written out + and emptied. */ + + if (UNIV_UNLIKELY + (!(rows_added = row_merge_buf_add( + buf, fts_index, old_table, + new_table, psort_info, + row, ext, history_fts, &doc_id, + conv_heap, &err, &v_heap, + eval_table, trx, col_collate)))) { + /* An empty buffer should have enough + room for at least one record. */ + ut_ad(err == DB_COMPUTE_VALUE_FAILED + || err == DB_OUT_OF_MEMORY + || err == DB_TOO_BIG_RECORD); + } else if (err == DB_SUCCESS) { + file->n_rec += rows_added; + continue; + } + + trx->error_key_num = i; + break; + } + } + + if (row == NULL) { + if (old_table != new_table) { + new_table->stat_n_rows = n_rows; + } + + goto all_done; + } + + if (err != DB_SUCCESS) { + goto func_exit; + } + + if (v_heap) { + mem_heap_empty(v_heap); + } + + /* Increment innodb_onlineddl_pct_progress status variable */ + read_rows++; + if(read_rows % 1000 == 0) { + /* Update progress for each 1000 rows */ + curr_progress = (read_rows >= table_total_rows) ? 
+ pct_cost : + pct_cost * static_cast<double>(read_rows) + / static_cast<double>(table_total_rows); + /* presenting 10.12% as 1012 integer */ + onlineddl_pct_progress = (ulint) (curr_progress * 100); + } + } + +func_exit: + ut_ad(mtr_started == mtr.is_active()); + if (mtr_started) { + mtr.commit(); + } + if (row_heap) { + mem_heap_free(row_heap); + } + ut_free(nonnull); + +all_done: + if (clust_btr_bulk != NULL) { + ut_ad(err != DB_SUCCESS); + clust_btr_bulk->latch(); + err = clust_btr_bulk->finish( + err); + UT_DELETE(clust_btr_bulk); + } + + if (prev_fields) { + ut_free(prev_fields); + mem_heap_free(mtuple_heap); + } + + if (v_heap) { + mem_heap_free(v_heap); + } + + if (conv_heap != NULL) { + mem_heap_free(conv_heap); + } + +#ifdef FTS_INTERNAL_DIAG_PRINT + DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Scan Table\n"); +#endif + if (UNIV_LIKELY_NULL(fts_parallel_sort_cond)) { +wait_again: + /* Check if error occurs in child thread */ + for (ulint j = 0; j < fts_sort_pll_degree; j++) { + if (psort_info[j].error != DB_SUCCESS) { + err = psort_info[j].error; + trx->error_key_num = j; + break; + } + } + + /* Tell all children that parent has done scanning */ + for (ulint i = 0; i < fts_sort_pll_degree; i++) { + if (err == DB_SUCCESS) { + psort_info[i].state = FTS_PARENT_COMPLETE; + } else { + psort_info[i].state = FTS_PARENT_EXITING; + } + } + + /* Now wait all children to report back to be completed */ + timespec abstime; + set_timespec(abstime, 1); + mysql_mutex_lock(&psort_info[0].mutex); + my_cond_timedwait(fts_parallel_sort_cond, + &psort_info[0].mutex.m_mutex, &abstime); + mysql_mutex_unlock(&psort_info[0].mutex); + + for (ulint i = 0; i < fts_sort_pll_degree; i++) { + if (!psort_info[i].child_status) { + goto wait_again; + } + } + + for (ulint j = 0; j < fts_sort_pll_degree; j++) { + psort_info[j].task->wait(); + delete psort_info[j].task; + } + } + +#ifdef FTS_INTERNAL_DIAG_PRINT + DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Tokenization\n"); +#endif + for (ulint i = 0; i < n_index; i++) { + row_merge_buf_free(merge_buf[i]); + } + + row_fts_free_pll_merge_buf(psort_info); + + ut_free(merge_buf); + ut_free(pcur.old_rec_buf); + + if (sp_tuples != NULL) { + for (ulint i = 0; i < num_spatial; i++) { + UT_DELETE(sp_tuples[i]); + } + ut_free(sp_tuples); + } + + /* Update the next Doc ID we used. Table should be locked, so + no concurrent DML */ + if (max_doc_id && err == DB_SUCCESS) { + /* Sync fts cache for other fts indexes to keep all + fts indexes consistent in sync_doc_id. */ + err = fts_sync_table(const_cast<dict_table_t*>(new_table)); + + if (err == DB_SUCCESS) { + new_table->fts->cache->synced_doc_id = max_doc_id; + + /* Update the max value as next FTS_DOC_ID */ + if (max_doc_id >= new_table->fts->cache->next_doc_id) { + new_table->fts->cache->next_doc_id = + max_doc_id + 1; + } + + new_table->fts->cache->first_doc_id = + new_table->fts->cache->next_doc_id; + + err= fts_update_sync_doc_id( + new_table, + new_table->fts->cache->synced_doc_id, + NULL); + } + } + + if (vers_update_trt) { + trx->mod_tables.emplace(new_table, 0) + .first->second.set_versioned(0); + } + + trx->op_info = ""; + + DBUG_RETURN(err); +} + +/** Write a record via buffer 2 and read the next record to buffer N. 
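+For example, ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto merged) appends the
+current record of input run 0 (mrec0) to the output run and then advances run 0
+to its next record; once run 0 is exhausted, control jumps to the "merged" label.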
+@param N number of the buffer (0 or 1) +@param INDEX record descriptor +@param AT_END statement to execute at end of input */ +#define ROW_MERGE_WRITE_GET_NEXT_LOW(N, INDEX, AT_END) \ + do { \ + b2 = row_merge_write_rec(&block[2 * srv_sort_buf_size], \ + &buf[2], b2, \ + of->fd, &of->offset, \ + mrec##N, offsets##N, \ + crypt_block ? &crypt_block[2 * srv_sort_buf_size] : NULL , \ + space); \ + if (UNIV_UNLIKELY(!b2 || ++of->n_rec > file->n_rec)) { \ + goto corrupt; \ + } \ + b##N = row_merge_read_rec(&block[N * srv_sort_buf_size],\ + &buf[N], b##N, INDEX, \ + file->fd, foffs##N, \ + &mrec##N, offsets##N, \ + crypt_block ? &crypt_block[N * srv_sort_buf_size] : NULL, \ + space); \ + \ + if (UNIV_UNLIKELY(!b##N)) { \ + if (mrec##N) { \ + goto corrupt; \ + } \ + AT_END; \ + } \ + } while (0) + +#ifdef HAVE_PSI_STAGE_INTERFACE +#define ROW_MERGE_WRITE_GET_NEXT(N, INDEX, AT_END) \ + do { \ + if (stage != NULL) { \ + stage->inc(); \ + } \ + ROW_MERGE_WRITE_GET_NEXT_LOW(N, INDEX, AT_END); \ + } while (0) +#else /* HAVE_PSI_STAGE_INTERFACE */ +#define ROW_MERGE_WRITE_GET_NEXT(N, INDEX, AT_END) \ + ROW_MERGE_WRITE_GET_NEXT_LOW(N, INDEX, AT_END) +#endif /* HAVE_PSI_STAGE_INTERFACE */ + +/** Merge two blocks of records on disk and write a bigger block. +@param[in] dup descriptor of index being created +@param[in] file file containing index entries +@param[in,out] block 3 buffers +@param[in,out] foffs0 offset of first source list in the file +@param[in,out] foffs1 offset of second source list in the file +@param[in,out] of output file +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. If not NULL stage->inc() will be called for each record +processed. +@param[in,out] crypt_block encryption buffer +@param[in] space tablespace ID for encryption +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +row_merge_blocks( + const row_merge_dup_t* dup, + const merge_file_t* file, + row_merge_block_t* block, + ulint* foffs0, + ulint* foffs1, + merge_file_t* of, + ut_stage_alter_t* stage MY_ATTRIBUTE((unused)), + row_merge_block_t* crypt_block, + ulint space) +{ + mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */ + + mrec_buf_t* buf; /*!< buffer for handling + split mrec in block[] */ + const byte* b0; /*!< pointer to block[0] */ + const byte* b1; /*!< pointer to block[srv_sort_buf_size] */ + byte* b2; /*!< pointer to block[2 * srv_sort_buf_size] */ + const mrec_t* mrec0; /*!< merge rec, points to block[0] or buf[0] */ + const mrec_t* mrec1; /*!< merge rec, points to + block[srv_sort_buf_size] or buf[1] */ + rec_offs* offsets0;/* offsets of mrec0 */ + rec_offs* offsets1;/* offsets of mrec1 */ + + DBUG_ENTER("row_merge_blocks"); + DBUG_LOG("ib_merge_sort", + "fd=" << file->fd << ',' << *foffs0 << '+' << *foffs1 + << " to fd=" << of->fd << ',' << of->offset); + + heap = row_merge_heap_create(dup->index, &buf, &offsets0, &offsets1); + + /* Write a record and read the next record. Split the output + file in two halves, which can be merged on the following pass. */ + + if (!row_merge_read(file->fd, *foffs0, &block[0], + crypt_block ? &crypt_block[0] : NULL, + space) || + !row_merge_read(file->fd, *foffs1, &block[srv_sort_buf_size], + crypt_block ? 
&crypt_block[srv_sort_buf_size] : NULL, + space)) { +corrupt: + mem_heap_free(heap); + DBUG_RETURN(DB_CORRUPTION); + } + + b0 = &block[0]; + b1 = &block[srv_sort_buf_size]; + b2 = &block[2 * srv_sort_buf_size]; + + b0 = row_merge_read_rec( + &block[0], &buf[0], b0, dup->index, + file->fd, foffs0, &mrec0, offsets0, + crypt_block ? &crypt_block[0] : NULL, + space); + + b1 = row_merge_read_rec( + &block[srv_sort_buf_size], + &buf[srv_sort_buf_size], b1, dup->index, + file->fd, foffs1, &mrec1, offsets1, + crypt_block ? &crypt_block[srv_sort_buf_size] : NULL, + space); + + if (UNIV_UNLIKELY(!b0 && mrec0) + || UNIV_UNLIKELY(!b1 && mrec1)) { + + goto corrupt; + } + + while (mrec0 && mrec1) { + int cmp = cmp_rec_rec_simple( + mrec0, mrec1, offsets0, offsets1, + dup->index, dup->table); + if (cmp < 0) { + ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto merged); + } else if (cmp) { + ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto merged); + } else { + mem_heap_free(heap); + DBUG_RETURN(DB_DUPLICATE_KEY); + } + } + +merged: + if (mrec0) { + /* append all mrec0 to output */ + for (;;) { + ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto done0); + } + } +done0: + if (mrec1) { + /* append all mrec1 to output */ + for (;;) { + ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto done1); + } + } +done1: + + mem_heap_free(heap); + + b2 = row_merge_write_eof( + &block[2 * srv_sort_buf_size], + b2, of->fd, &of->offset, + crypt_block ? &crypt_block[2 * srv_sort_buf_size] : NULL, + space); + DBUG_RETURN(b2 ? DB_SUCCESS : DB_CORRUPTION); +} + +/** Copy a block of index entries. +@param[in] index index being created +@param[in] file input file +@param[in,out] block 3 buffers +@param[in,out] foffs0 input file offset +@param[in,out] of output file +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. If not NULL stage->inc() will be called for each record +processed. +@param[in,out] crypt_block encryption buffer +@param[in] space tablespace ID for encryption +@return TRUE on success, FALSE on failure */ +static MY_ATTRIBUTE((warn_unused_result)) +ibool +row_merge_blocks_copy( + const dict_index_t* index, + const merge_file_t* file, + row_merge_block_t* block, + ulint* foffs0, + merge_file_t* of, + ut_stage_alter_t* stage MY_ATTRIBUTE((unused)), + row_merge_block_t* crypt_block, + ulint space) +{ + mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */ + + mrec_buf_t* buf; /*!< buffer for handling + split mrec in block[] */ + const byte* b0; /*!< pointer to block[0] */ + byte* b2; /*!< pointer to block[2 * srv_sort_buf_size] */ + const mrec_t* mrec0; /*!< merge rec, points to block[0] */ + rec_offs* offsets0;/* offsets of mrec0 */ + rec_offs* offsets1;/* dummy offsets */ + + DBUG_ENTER("row_merge_blocks_copy"); + DBUG_LOG("ib_merge_sort", + "fd=" << file->fd << ',' << foffs0 + << " to fd=" << of->fd << ',' << of->offset); + + heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1); + + /* Write a record and read the next record. Split the output + file in two halves, which can be merged on the following pass. */ + + if (!row_merge_read(file->fd, *foffs0, &block[0], + crypt_block ? &crypt_block[0] : NULL, + space)) { +corrupt: + mem_heap_free(heap); + DBUG_RETURN(FALSE); + } + + b0 = &block[0]; + + b2 = &block[2 * srv_sort_buf_size]; + + b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, + file->fd, foffs0, &mrec0, offsets0, + crypt_block ? 
&crypt_block[0] : NULL, + space); + + if (UNIV_UNLIKELY(!b0 && mrec0)) { + + goto corrupt; + } + + if (mrec0) { + /* append all mrec0 to output */ + for (;;) { + ROW_MERGE_WRITE_GET_NEXT(0, index, goto done0); + } + } +done0: + + /* The file offset points to the beginning of the last page + that has been read. Update it to point to the next block. */ + (*foffs0)++; + + mem_heap_free(heap); + + DBUG_RETURN(row_merge_write_eof( + &block[2 * srv_sort_buf_size], + b2, of->fd, &of->offset, + crypt_block + ? &crypt_block[2 * srv_sort_buf_size] + : NULL, space) + != NULL); +} + +/** Merge disk files. +@param[in] trx transaction +@param[in] dup descriptor of index being created +@param[in,out] file file containing index entries +@param[in,out] block 3 buffers +@param[in,out] tmpfd temporary file handle +@param[in,out] num_run Number of runs that remain to be merged +@param[in,out] run_offset Array that contains the first offset number +for each merge run +@param[in,out] stage performance schema accounting object, used by +@param[in,out] crypt_block encryption buffer +@param[in] space tablespace ID for encryption +ALTER TABLE. If not NULL stage->inc() will be called for each record +processed. +@return DB_SUCCESS or error code */ +static +dberr_t +row_merge( + trx_t* trx, + const row_merge_dup_t* dup, + merge_file_t* file, + row_merge_block_t* block, + pfs_os_file_t* tmpfd, + ulint* num_run, + ulint* run_offset, + ut_stage_alter_t* stage, + row_merge_block_t* crypt_block, + ulint space) +{ + ulint foffs0; /*!< first input offset */ + ulint foffs1; /*!< second input offset */ + dberr_t error; /*!< error code */ + merge_file_t of; /*!< output file */ + const ulint ihalf = run_offset[*num_run / 2]; + /*!< half the input file */ + ulint n_run = 0; + /*!< num of runs generated from this merge */ + + MEM_CHECK_ADDRESSABLE(&block[0], 3 * srv_sort_buf_size); + + if (crypt_block) { + MEM_CHECK_ADDRESSABLE(&crypt_block[0], 3 * srv_sort_buf_size); + } + + ut_ad(ihalf < file->offset); + + of.fd = *tmpfd; + of.offset = 0; + of.n_rec = 0; + +#ifdef POSIX_FADV_SEQUENTIAL + /* The input file will be read sequentially, starting from the + beginning and the middle. In Linux, the POSIX_FADV_SEQUENTIAL + affects the entire file. Each block will be read exactly once. */ + posix_fadvise(file->fd, 0, 0, + POSIX_FADV_SEQUENTIAL | POSIX_FADV_NOREUSE); +#endif /* POSIX_FADV_SEQUENTIAL */ + + /* Merge blocks to the output file. */ + foffs0 = 0; + foffs1 = ihalf; + + MEM_UNDEFINED(run_offset, *num_run * sizeof *run_offset); + + for (; foffs0 < ihalf && foffs1 < file->offset; foffs0++, foffs1++) { + + if (trx_is_interrupted(trx)) { + return(DB_INTERRUPTED); + } + + /* Remember the offset number for this run */ + run_offset[n_run++] = of.offset; + + error = row_merge_blocks(dup, file, block, + &foffs0, &foffs1, &of, stage, + crypt_block, space); + + if (error != DB_SUCCESS) { + return(error); + } + + } + + /* Copy the last blocks, if there are any. 
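+	After the pairwise merge loop above, at most one of the two input halves
+	can still contain unread blocks; those remaining runs are copied to the
+	output file unchanged by row_merge_blocks_copy().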
*/ + + while (foffs0 < ihalf) { + + if (UNIV_UNLIKELY(trx_is_interrupted(trx))) { + return(DB_INTERRUPTED); + } + + /* Remember the offset number for this run */ + run_offset[n_run++] = of.offset; + + if (!row_merge_blocks_copy(dup->index, file, block, + &foffs0, &of, stage, + crypt_block, space)) { + return(DB_CORRUPTION); + } + } + + ut_ad(foffs0 == ihalf); + + while (foffs1 < file->offset) { + + if (trx_is_interrupted(trx)) { + return(DB_INTERRUPTED); + } + + /* Remember the offset number for this run */ + run_offset[n_run++] = of.offset; + + if (!row_merge_blocks_copy(dup->index, file, block, + &foffs1, &of, stage, + crypt_block, space)) { + return(DB_CORRUPTION); + } + } + + ut_ad(foffs1 == file->offset); + + if (UNIV_UNLIKELY(of.n_rec != file->n_rec)) { + return(DB_CORRUPTION); + } + + ut_ad(n_run <= *num_run); + + *num_run = n_run; + + /* Each run can contain one or more offsets. As merge goes on, + the number of runs (to merge) will reduce until we have one + single run. So the number of runs will always be smaller than + the number of offsets in file */ + ut_ad((*num_run) <= file->offset); + + /* The number of offsets in output file is always equal or + smaller than input file */ + ut_ad(of.offset <= file->offset); + + /* Swap file descriptors for the next pass. */ + *tmpfd = file->fd; + *file = of; + + MEM_UNDEFINED(&block[0], 3 * srv_sort_buf_size); + + return(DB_SUCCESS); +} + +/** Merge disk files. +@param[in] trx transaction +@param[in] dup descriptor of index being created +@param[in,out] file file containing index entries +@param[in,out] block 3 buffers +@param[in,out] tmpfd temporary file handle +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. If not NULL, stage->begin_phase_sort() will be called initially +and then stage->inc() will be called for each record processed. +@return DB_SUCCESS or error code */ +dberr_t +row_merge_sort( + trx_t* trx, + const row_merge_dup_t* dup, + merge_file_t* file, + row_merge_block_t* block, + pfs_os_file_t* tmpfd, + const bool update_progress, + /*!< in: update progress + status variable or not */ + const double pct_progress, + /*!< in: total progress percent + until now */ + const double pct_cost, /*!< in: current progress percent */ + row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */ + ulint space, /*!< in: space id */ + ut_stage_alter_t* stage) +{ + const ulint half = file->offset / 2; + ulint num_runs; + ulint* run_offset; + dberr_t error = DB_SUCCESS; + ulint merge_count = 0; + ulint total_merge_sort_count; + double curr_progress = 0; + + DBUG_ENTER("row_merge_sort"); + + /* Record the number of merge runs we need to perform */ + num_runs = file->offset; + + if (stage != NULL) { + stage->begin_phase_sort(log2(double(num_runs))); + } + + /* If num_runs are less than 1, nothing to merge */ + if (num_runs <= 1) { + DBUG_RETURN(error); + } + + total_merge_sort_count = ulint(ceil(log2(double(num_runs)))); + + /* "run_offset" records each run's first offset number */ + run_offset = (ulint*) ut_malloc_nokey(file->offset * sizeof(ulint)); + + /* This tells row_merge() where to start for the first round + of merge. */ + run_offset[half] = half; + + /* The file should always contain at least one byte (the end + of file marker). Thus, it must be at least one block. */ + ut_ad(file->offset > 0); + + /* These thd_progress* calls will crash on sol10-64 when innodb_plugin + is used. MDEV-9356: innodb.innodb_bug53290 fails (crashes) on + sol10-64 in buildbot. 
+ */ +#ifndef __sun__ + /* Progress report only for "normal" indexes. */ + if (dup && !(dup->index->type & DICT_FTS)) { + thd_progress_init(trx->mysql_thd, 1); + } +#endif /* __sun__ */ + + if (global_system_variables.log_warnings > 2) { + sql_print_information("InnoDB: Online DDL : merge-sorting" + " has estimated " ULINTPF " runs", + num_runs); + } + + /* Merge the runs until we have one big run */ + do { + /* Report progress of merge sort to MySQL for + show processlist progress field */ + /* Progress report only for "normal" indexes. */ +#ifndef __sun__ + if (dup && !(dup->index->type & DICT_FTS)) { + thd_progress_report(trx->mysql_thd, file->offset - num_runs, file->offset); + } +#endif /* __sun__ */ + + error = row_merge(trx, dup, file, block, tmpfd, + &num_runs, run_offset, stage, + crypt_block, space); + + if(update_progress) { + merge_count++; + curr_progress = (merge_count >= total_merge_sort_count) ? + pct_cost : + pct_cost * static_cast<double>(merge_count) + / static_cast<double>(total_merge_sort_count); + /* presenting 10.12% as 1012 integer */; + onlineddl_pct_progress = (ulint) ((pct_progress + curr_progress) * 100); + } + + if (error != DB_SUCCESS) { + break; + } + + MEM_CHECK_DEFINED(run_offset, num_runs * sizeof *run_offset); + } while (num_runs > 1); + + ut_free(run_offset); + + /* Progress report only for "normal" indexes. */ +#ifndef __sun__ + if (dup && !(dup->index->type & DICT_FTS)) { + thd_progress_end(trx->mysql_thd); + } +#endif /* __sun__ */ + + DBUG_RETURN(error); +} + +/** Copy the blob from the given blob file and store it +in field data for the tuple +@param tuple tuple to be inserted +@param heap heap to allocate the memory for the blob storage +@param blob_file file to handle blob data */ +static dberr_t row_merge_copy_blob_from_file(dtuple_t *tuple, mem_heap_t *heap, + merge_file_t *blob_file) +{ + for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) + { + dfield_t *field= dtuple_get_nth_field(tuple, i); + const byte *field_data= static_cast<byte*>(dfield_get_data(field)); + ulint field_len= dfield_get_len(field); + if (!dfield_is_ext(field)) + continue; + + ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE); + ut_ad(!dfield_is_null(field)); + + ut_ad(mach_read_from_8(field_data) == 0); + uint64_t offset= mach_read_from_8(field_data + 8); + uint32_t len= mach_read_from_4(field_data + 16); + + byte *data= (byte*) mem_heap_alloc(heap, len); + if (dberr_t err= os_file_read(IORequestRead, blob_file->fd, data, + offset, len, nullptr)) + return err; + dfield_set_data(field, data, len); + } + + return DB_SUCCESS; +} + +/** Copy externally stored columns to the data tuple. +@param[in] mrec record containing BLOB pointers, +or NULL to use tuple instead +@param[in] offsets offsets of mrec +@param[in] zip_size compressed page size in bytes, or 0 +@param[in,out] tuple data tuple +@param[in,out] heap memory heap */ +static +void +row_merge_copy_blobs( + const mrec_t* mrec, + const rec_offs* offsets, + ulint zip_size, + dtuple_t* tuple, + mem_heap_t* heap) +{ + ut_ad(mrec == NULL || rec_offs_any_extern(offsets)); + + for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) { + ulint len; + const void* data; + dfield_t* field = dtuple_get_nth_field(tuple, i); + ulint field_len; + const byte* field_data; + + if (!dfield_is_ext(field)) { + continue; + } + + ut_ad(!dfield_is_null(field)); + + /* During the creation of a PRIMARY KEY, the table is + X-locked, and we skip copying records that have been + marked for deletion. 
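+	(Once the table is X-locked, the only remaining mechanism that could
+	free an off-page column is the purge of already delete-marked records,
+	and we never follow the BLOB pointers of such records.)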
Therefore, externally stored + columns cannot possibly be freed between the time the + BLOB pointers are read (row_merge_read_clustered_index()) + and dereferenced (below). */ + if (mrec == NULL) { + field_data + = static_cast<byte*>(dfield_get_data(field)); + field_len = dfield_get_len(field); + + ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE); + + ut_a(memcmp(field_data + field_len + - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + + data = btr_copy_externally_stored_field( + &len, field_data, zip_size, field_len, heap); + } else { + data = btr_rec_copy_externally_stored_field( + mrec, offsets, zip_size, i, &len, heap); + } + + /* Because we have locked the table, any records + written by incomplete transactions must have been + rolled back already. There must not be any incomplete + BLOB columns. */ + ut_a(data); + + dfield_set_data(field, data, len); + } +} + +/** Convert a merge record to a typed data tuple. Note that externally +stored fields are not copied to heap. +@param[in,out] index index on the table +@param[in] mtuple merge record +@param[in] heap memory heap from which memory needed is allocated +@return index entry built. */ +static +void +row_merge_mtuple_to_dtuple( + dict_index_t* index, + dtuple_t* dtuple, + const mtuple_t* mtuple) +{ + ut_ad(!dict_index_is_ibuf(index)); + + memcpy(dtuple->fields, mtuple->fields, + dtuple->n_fields * sizeof *mtuple->fields); +} + +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +row_merge_insert_index_tuples( + dict_index_t* index, + const dict_table_t* old_table, + const pfs_os_file_t& fd, + row_merge_block_t* block, + const row_merge_buf_t* row_buf, + BtrBulk* btr_bulk, + const ib_uint64_t table_total_rows, + double pct_progress, + double pct_cost, + row_merge_block_t* crypt_block, + ulint space, + ut_stage_alter_t* stage, + merge_file_t* blob_file) +{ + const byte* b; + mem_heap_t* heap; + mem_heap_t* tuple_heap; + dberr_t error = DB_SUCCESS; + ulint foffs = 0; + rec_offs* offsets; + mrec_buf_t* buf; + ulint n_rows = 0; + dtuple_t* dtuple; + ib_uint64_t inserted_rows = 0; + double curr_progress = 0; + dict_index_t* old_index = NULL; + const mrec_t* mrec = NULL; + mtr_t mtr; + + + DBUG_ENTER("row_merge_insert_index_tuples"); + + ut_ad(!srv_read_only_mode); + ut_ad(!(index->type & DICT_FTS)); + ut_ad(!dict_index_is_spatial(index)); + + if (stage != NULL) { + stage->begin_phase_insert(); + } + + tuple_heap = mem_heap_create(1000); + + { + ulint i = 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + heap = mem_heap_create(sizeof *buf + i * sizeof *offsets); + offsets = static_cast<rec_offs*>( + mem_heap_alloc(heap, i * sizeof *offsets)); + rec_offs_set_n_alloc(offsets, i); + rec_offs_set_n_fields(offsets, dict_index_get_n_fields(index)); + } + + if (row_buf != NULL) { + ut_ad(fd == OS_FILE_CLOSED); + ut_ad(block == NULL); + DBUG_EXECUTE_IF("row_merge_read_failure", + error = DB_CORRUPTION; + goto err_exit;); + buf = NULL; + b = NULL; + dtuple = dtuple_create( + heap, dict_index_get_n_fields(index)); + dtuple_set_n_fields_cmp( + dtuple, dict_index_get_n_unique_in_tree(index)); + } else { + b = block; + dtuple = NULL; + + if (!row_merge_read(fd, foffs, block, crypt_block, space)) { + error = DB_CORRUPTION; + goto err_exit; + } else { + buf = static_cast<mrec_buf_t*>( + mem_heap_alloc(heap, sizeof *buf)); + } + } + + for (;;) { + + if (stage != NULL) { + stage->inc(); + } + + if (row_buf != NULL) { + if (n_rows >= row_buf->n_tuples) { + break; + } + + /* Convert merge tuple record from + row buffer 
to data tuple record */ + row_merge_mtuple_to_dtuple( + index, dtuple, &row_buf->tuples[n_rows]); + n_rows++; + /* BLOB pointers must be copied from dtuple */ + mrec = NULL; + } else { + b = row_merge_read_rec(block, buf, b, index, + fd, &foffs, &mrec, offsets, + crypt_block, + space); + + if (UNIV_UNLIKELY(!b)) { + /* End of list, or I/O error */ + if (mrec) { + error = DB_CORRUPTION; + } + break; + } + + dtuple = row_rec_to_index_entry_low( + mrec, index, offsets, tuple_heap); + } + + old_index = dict_table_get_first_index(old_table); + + if (dict_index_is_clust(index) + && dict_index_is_online_ddl(old_index)) { + error = row_log_table_get_error(old_index); + if (error != DB_SUCCESS) { + break; + } + } + + ut_ad(!dtuple_get_n_ext(dtuple) || index->is_primary()); + + if (!dtuple_get_n_ext(dtuple)) { + } else if (blob_file) { + error = row_merge_copy_blob_from_file( + dtuple, tuple_heap, blob_file); + if (error != DB_SUCCESS) { + break; + } + } else { + /* Off-page columns can be fetched safely + when concurrent modifications to the table + are disabled. (Purge can process delete-marked + records, but row_merge_read_clustered_index() + would have skipped them.) + + When concurrent modifications are enabled, + row_merge_read_clustered_index() will + only see rows from transactions that were + committed before the ALTER TABLE started + (REPEATABLE READ). + + Any modifications after the + row_merge_read_clustered_index() scan + will go through row_log_table_apply(). */ + row_merge_copy_blobs( + mrec, offsets, + old_table->space->zip_size(), + dtuple, tuple_heap); + } + + ut_ad(dtuple_validate(dtuple)); + error = btr_bulk->insert(dtuple); + + if (error != DB_SUCCESS) { + goto err_exit; + } + + mem_heap_empty(tuple_heap); + + /* Increment innodb_onlineddl_pct_progress status variable */ + inserted_rows++; + if(inserted_rows % 1000 == 0) { + /* Update progress for each 1000 rows */ + curr_progress = (inserted_rows >= table_total_rows || + table_total_rows <= 0) ? + pct_cost : + pct_cost * static_cast<double>(inserted_rows) + / static_cast<double>(table_total_rows); + + /* presenting 10.12% as 1012 integer */; + onlineddl_pct_progress = (ulint) ((pct_progress + curr_progress) * 100); + } + } + +err_exit: + mem_heap_free(tuple_heap); + mem_heap_free(heap); + + DBUG_RETURN(error); +} + +/*********************************************************************//** +Drop an index that was created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. */ +static +void +row_merge_drop_index_dict( +/*======================*/ + trx_t* trx, /*!< in/out: dictionary transaction */ + index_id_t index_id)/*!< in: index identifier */ +{ + static const char sql[] = + "PROCEDURE DROP_INDEX_PROC () IS\n" + "BEGIN\n" + "DELETE FROM SYS_FIELDS WHERE INDEX_ID=:indexid;\n" + "DELETE FROM SYS_INDEXES WHERE ID=:indexid;\n" + "END;\n"; + dberr_t error; + pars_info_t* info; + + ut_ad(!srv_read_only_mode); + ut_ad(trx->dict_operation_lock_mode); + ut_ad(trx->dict_operation); + ut_ad(dict_sys.locked()); + + info = pars_info_create(); + pars_info_add_ull_literal(info, "indexid", index_id); + trx->op_info = "dropping index from dictionary"; + error = que_eval_sql(info, sql, trx); + + if (error != DB_SUCCESS) { + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. 
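+		In that case the failure is only logged and trx->error_state is
+		reset to DB_SUCCESS, so that the caller can proceed with its own
+		cleanup.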
*/ + trx->error_state = DB_SUCCESS; + + ib::error() << "row_merge_drop_index_dict failed with error " + << error; + } + + trx->op_info = ""; +} + +/*********************************************************************//** +Drop indexes that were created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. */ +static +void +row_merge_drop_indexes_dict( +/*========================*/ + trx_t* trx, /*!< in/out: dictionary transaction */ + table_id_t table_id)/*!< in: table identifier */ +{ + static const char sql[] = + "PROCEDURE DROP_INDEXES_PROC () IS\n" + "ixid CHAR;\n" + "found INT;\n" + + "DECLARE CURSOR index_cur IS\n" + " SELECT ID FROM SYS_INDEXES\n" + " WHERE TABLE_ID=:tableid AND\n" + " SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n" + "FOR UPDATE;\n" + + "BEGIN\n" + "found := 1;\n" + "OPEN index_cur;\n" + "WHILE found = 1 LOOP\n" + " FETCH index_cur INTO ixid;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n" + " DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE index_cur;\n" + + "END;\n"; + dberr_t error; + pars_info_t* info; + + ut_ad(!srv_read_only_mode); + ut_ad(trx->dict_operation_lock_mode); + ut_ad(trx->dict_operation); + ut_ad(dict_sys.locked()); + + /* It is possible that table->n_ref_count > 1 when + locked=TRUE. In this case, all code that should have an open + handle to the table be waiting for the next statement to execute, + or waiting for a meta-data lock. + + A concurrent purge will be prevented by dict_sys.latch. */ + + info = pars_info_create(); + pars_info_add_ull_literal(info, "tableid", table_id); + trx->op_info = "dropping indexes"; + error = que_eval_sql(info, sql, trx); + + switch (error) { + case DB_SUCCESS: + break; + default: + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. */ + ib::error() << "row_merge_drop_indexes_dict failed with error " + << error; + /* fall through */ + case DB_TOO_MANY_CONCURRENT_TRXS: + trx->error_state = DB_SUCCESS; + } + + trx->op_info = ""; +} + +/** Drop common internal tables if all fulltext indexes are dropped +@param trx transaction +@param table user table */ +static void row_merge_drop_fulltext_indexes(trx_t *trx, dict_table_t *table) +{ + if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) || + !table->fts || + !ib_vector_is_empty(table->fts->indexes)) + return; + + for (const dict_index_t *index= dict_table_get_first_index(table); + index; index= dict_table_get_next_index(index)) + if (index->type & DICT_FTS) + return; + + fts_optimize_remove_table(table); + fts_drop_tables(trx, *table); + table->fts->~fts_t(); + table->fts= nullptr; + DICT_TF2_FLAG_UNSET(table, DICT_TF2_FTS); +} + +/** Drop indexes that were created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. 
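+If the table is still referenced by other threads, uncommitted indexes are not
+dropped immediately but only marked as aborted, so that they can be dropped
+lazily later.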
+@param trx dictionary transaction +@param table table containing the indexes +@param locked True if table is locked, + false - may need to do lazy drop +@param alter_trx Alter table transaction */ +void +row_merge_drop_indexes( + trx_t* trx, + dict_table_t* table, + bool locked, + const trx_t* alter_trx) +{ + dict_index_t* index; + dict_index_t* next_index; + + ut_ad(!srv_read_only_mode); + ut_ad(trx->dict_operation_lock_mode); + ut_ad(trx->dict_operation); + ut_ad(dict_sys.locked()); + + index = dict_table_get_first_index(table); + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_get_online_status(index) == ONLINE_INDEX_COMPLETE); + + /* the caller should have an open handle to the table */ + ut_ad(table->get_ref_count() >= 1); + + /* It is possible that table->n_ref_count > 1 when + locked=TRUE. In this case, all code that should have an open + handle to the table be waiting for the next statement to execute, + or waiting for a meta-data lock. + + A concurrent purge will be prevented by MDL. */ + + if (!locked && (table->get_ref_count() > 1 + || table->has_lock_other_than(alter_trx))) { + while ((index = dict_table_get_next_index(index)) != NULL) { + ut_ad(!dict_index_is_clust(index)); + + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_ABORTED_DROPPED: + continue; + case ONLINE_INDEX_COMPLETE: + if (index->is_committed()) { + /* Do nothing to already + published indexes. */ + } else if (index->type & DICT_FTS) { + /* Drop a completed FULLTEXT + index, due to a timeout during + MDL upgrade for + commit_inplace_alter_table(). + Because only concurrent reads + are allowed (and they are not + seeing this index yet) we + are safe to drop the index. */ + dict_index_t* prev = UT_LIST_GET_PREV( + indexes, index); + /* At least there should be + the clustered index before + this one. */ + ut_ad(prev); + ut_a(table->fts); + fts_drop_index(table, index, trx); + row_merge_drop_index_dict( + trx, index->id); + /* We can remove a DICT_FTS + index from the cache, because + we do not allow ADD FULLTEXT INDEX + with LOCK=NONE. If we allowed that, + we should exclude FTS entries from + prebuilt->ins_node->entry_list + in ins_node_create_entry_list(). */ +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!index->search_info->ref_count); +#endif /* BTR_CUR_HASH_ADAPT */ + dict_index_remove_from_cache( + table, index); + index = prev; + } else { + index->lock.x_lock(SRW_LOCK_CALL); + dict_index_set_online_status( + index, ONLINE_INDEX_ABORTED); + index->type |= DICT_CORRUPT; + table->drop_aborted = TRUE; + goto drop_aborted; + } + continue; + case ONLINE_INDEX_CREATION: + index->lock.x_lock(SRW_LOCK_CALL); + ut_ad(!index->is_committed()); + row_log_abort_sec(index); + drop_aborted: + index->lock.x_unlock(); + + DEBUG_SYNC_C("merge_drop_index_after_abort"); + /* covered by dict_sys.latch */ + MONITOR_INC(MONITOR_BACKGROUND_DROP_INDEX); + /* fall through */ + case ONLINE_INDEX_ABORTED: + /* Drop the index tree from the + data dictionary and free it from + the tablespace, but keep the object + in the data dictionary cache. */ + row_merge_drop_index_dict(trx, index->id); + index->lock.x_lock(SRW_LOCK_CALL); + dict_index_set_online_status( + index, ONLINE_INDEX_ABORTED_DROPPED); + index->lock.x_unlock(); + table->drop_aborted = TRUE; + continue; + } + ut_error; + } + + row_merge_drop_fulltext_indexes(trx, table); + return; + } + + row_merge_drop_indexes_dict(trx, table->id); + + /* Invalidate all row_prebuilt_t::ins_graph that are referring + to this table. 
That is, force row_get_prebuilt_insert_row() to + rebuild prebuilt->ins_node->entry_list). */ + if (table->def_trx_id < trx->id) { + table->def_trx_id = trx->id; + } else { + ut_ad(table->def_trx_id == trx->id || table->name.part()); + } + + next_index = dict_table_get_next_index(index); + + while ((index = next_index) != NULL) { + /* read the next pointer before freeing the index */ + next_index = dict_table_get_next_index(index); + + ut_ad(!dict_index_is_clust(index)); + + if (!index->is_committed()) { + /* If it is FTS index, drop from table->fts + and also drop its auxiliary tables */ + if (index->type & DICT_FTS) { + ut_a(table->fts); + fts_drop_index(table, index, trx); + } + + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_CREATION: + /* This state should only be possible + when prepare_inplace_alter_table() fails + after invoking row_merge_create_index(). + In inplace_alter_table(), + row_merge_build_indexes() + should never leave the index in this state. + It would invoke row_log_abort_sec() on + failure. */ + case ONLINE_INDEX_COMPLETE: + /* In these cases, we are able to drop + the index straight. The DROP INDEX was + never deferred. */ + break; + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + /* covered by dict_sys.latch */ + MONITOR_DEC(MONITOR_BACKGROUND_DROP_INDEX); + } + + dict_index_remove_from_cache(table, index); + } + } + + row_merge_drop_fulltext_indexes(trx, table); + table->drop_aborted = FALSE; + ut_d(dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE)); +} + +/** Drop fulltext indexes */ +static ibool row_merge_drop_fts(void *node, void *trx) +{ + auto s= static_cast<sel_node_t*>(node); + + const dfield_t *table_id= que_node_get_val(s->select_list); + ut_ad(table_id->type.mtype == DATA_BINARY); + node= que_node_get_next(s->select_list); + ut_ad(!que_node_get_next(node)); + const dfield_t *index_id= que_node_get_val(node); + ut_ad(index_id->type.mtype == DATA_BINARY); + + static const char sql[]= + "PROCEDURE DROP_TABLES_PROC () IS\n" + "tid CHAR;\n" + "iid CHAR;\n" + + "DECLARE CURSOR cur_tab IS\n" + "SELECT ID FROM SYS_TABLES\n" + "WHERE INSTR(NAME,:name)+45=LENGTH(NAME)" + " AND INSTR('123456',SUBSTR(NAME,LENGTH(NAME)-1,1))>0" + " FOR UPDATE;\n" + + "DECLARE CURSOR cur_idx IS\n" + "SELECT ID FROM SYS_INDEXES\n" + "WHERE TABLE_ID = tid FOR UPDATE;\n" + + "BEGIN\n" + "OPEN cur_tab;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH cur_tab INTO tid;\n" + " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n" + " OPEN cur_idx;\n" + " WHILE 1 = 1 LOOP\n" + " FETCH cur_idx INTO iid;\n" + " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n" + " DELETE FROM SYS_FIELDS WHERE INDEX_ID=iid;\n" + " DELETE FROM SYS_INDEXES WHERE CURRENT OF cur_idx;\n" + " END LOOP;\n" + " CLOSE cur_idx;\n" + " DELETE FROM SYS_COLUMNS WHERE TABLE_ID=tid;\n" + " DELETE FROM SYS_TABLES WHERE CURRENT OF cur_tab;\n" + "END LOOP;\n" + "CLOSE cur_tab;\n" + "END;\n"; + + if (table_id->len == 8 && index_id->len == 8) + { + char buf[sizeof "/FTS_0000000000000000_0000000000000000_INDEX_"]; + snprintf(buf, sizeof buf, "/FTS_%016llx_%016llx_INDEX_", + static_cast<ulonglong> + (mach_read_from_8(static_cast<const byte*>(table_id->data))), + static_cast<ulonglong> + (mach_read_from_8(static_cast<const byte*>(index_id->data)))); + auto pinfo= pars_info_create(); + pars_info_add_str_literal(pinfo, "name", buf); + que_eval_sql(pinfo, sql, static_cast<trx_t*>(trx)); + } + + return true; +} + +/** During recovery, drop recovered index stubs that were created in +prepare_inplace_alter_table_dict(). 
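+Such index stubs are recognized by the TEMP_INDEX_PREFIX_STR marker at the start
+of their name; orphaned FULLTEXT auxiliary tables belonging to them are dropped
+as well.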
*/ +void row_merge_drop_temp_indexes() +{ + static_assert(DICT_FTS == 32, "compatibility"); + + static const char sql[] = + "PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n" + "ixid CHAR;\n" + "found INT;\n" + + "DECLARE FUNCTION drop_fts;\n" + + "DECLARE CURSOR fts_cur IS\n" + " SELECT TABLE_ID,ID FROM SYS_INDEXES\n" + " WHERE TYPE=32" + " AND SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n" + " FOR UPDATE;\n" + + "DECLARE CURSOR index_cur IS\n" + " SELECT ID FROM SYS_INDEXES\n" + " WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n" + "FOR UPDATE;\n" + + "BEGIN\n" + "found := 1;\n" + "OPEN fts_cur;\n" + "WHILE found = 1 LOOP\n" + " FETCH fts_cur INTO drop_fts();\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE fts_cur;\n" + + "OPEN index_cur;\n" + "WHILE found = 1 LOOP\n" + " FETCH index_cur INTO ixid;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n" + " DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE index_cur;\n" + "END;\n"; + + /* Load the table definitions that contain partially defined + indexes, so that the data dictionary information can be checked + when accessing the tablename.ibd files. */ + trx_t* trx = trx_create(); + trx_start_for_ddl(trx); + trx->op_info = "dropping partially created indexes"; + dberr_t error = lock_sys_tables(trx); + + row_mysql_lock_data_dictionary(trx); + /* Ensure that this transaction will be rolled back and locks + will be released, if the server gets killed before the commit + gets written to the redo log. */ + trx->dict_operation = true; + + trx->op_info = "dropping indexes"; + + pars_info_t* pinfo = pars_info_create(); + pars_info_bind_function(pinfo, "drop_fts", row_merge_drop_fts, trx); + if (error == DB_SUCCESS) { + error = que_eval_sql(pinfo, sql, trx); + } + + if (error) { + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. */ + trx->error_state = DB_SUCCESS; + + ib::error() << "row_merge_drop_temp_indexes(): " << error; + } + + trx_commit_for_mysql(trx); + row_mysql_unlock_data_dictionary(trx); + trx->free(); +} + + +/** Create temporary merge files in the given paramater path, and if +UNIV_PFS_IO defined, register the file descriptor with Performance Schema. +@param[in] path location for creating temporary merge files, or NULL +@return File descriptor */ +pfs_os_file_t +row_merge_file_create_low( + const char* path) +{ + if (!path) { + path = mysql_tmpdir; + } +#ifdef UNIV_PFS_IO + /* This temp file open does not go through normal + file APIs, add instrumentation to register with + performance schema */ + struct PSI_file_locker* locker; + PSI_file_locker_state state; + static const char label[] = "/Innodb Merge Temp File"; + char* name = static_cast<char*>( + ut_malloc_nokey(strlen(path) + sizeof label)); + strcpy(name, path); + strcat(name, label); + + register_pfs_file_open_begin( + &state, locker, innodb_temp_file_key, + PSI_FILE_CREATE, path ? 
name : label, __FILE__, __LINE__); + +#endif + DBUG_ASSERT(strlen(path) + 2 <= FN_REFLEN); + char filename[FN_REFLEN]; + File f = create_temp_file(filename, path, "ib", + O_BINARY | O_SEQUENTIAL, + MYF(MY_WME | MY_TEMPORARY)); + pfs_os_file_t fd = IF_WIN((os_file_t)my_get_osfhandle(f), f); + +#ifdef UNIV_PFS_IO + register_pfs_file_open_end(locker, fd, + (fd == OS_FILE_CLOSED)?NULL:&fd); + ut_free(name); +#endif + + if (fd == OS_FILE_CLOSED) { + ib::error() << "Cannot create temporary merge file"; + } + return(fd); +} + + +/** Create a merge file in the given location. +@param[out] merge_file merge file structure +@param[in] path location for creating temporary file, or NULL +@return file descriptor, or OS_FILE_CLOSED on error */ +pfs_os_file_t +row_merge_file_create( + merge_file_t* merge_file, + const char* path) +{ + merge_file->fd = row_merge_file_create_low(path); + merge_file->offset = 0; + merge_file->n_rec = 0; + + if (merge_file->fd != OS_FILE_CLOSED) { + if (srv_disable_sort_file_cache) { + os_file_set_nocache(merge_file->fd, + "row0merge.cc", "sort"); + } + } + return(merge_file->fd); +} + +/*********************************************************************//** +Destroy a merge file. And de-register the file from Performance Schema +if UNIV_PFS_IO is defined. */ +void +row_merge_file_destroy_low( +/*=======================*/ + const pfs_os_file_t& fd) /*!< in: merge file descriptor */ +{ + if (fd != OS_FILE_CLOSED) { + int res = mysql_file_close(IF_WIN(my_win_handle2File((os_file_t)fd), fd), + MYF(MY_WME)); + ut_a(res != -1); + } +} +/*********************************************************************//** +Destroy a merge file. */ +void +row_merge_file_destroy( +/*===================*/ + merge_file_t* merge_file) /*!< in/out: merge file structure */ +{ + ut_ad(!srv_read_only_mode); + + if (merge_file->fd != OS_FILE_CLOSED) { + row_merge_file_destroy_low(merge_file->fd); + merge_file->fd = OS_FILE_CLOSED; + } +} + +/*********************************************************************//** +Rename an index in the dictionary that was created. The data +dictionary must have been locked exclusively by the caller, because +the transaction will not be committed. +@return DB_SUCCESS if all OK */ +dberr_t +row_merge_rename_index_to_add( +/*==========================*/ + trx_t* trx, /*!< in/out: transaction */ + table_id_t table_id, /*!< in: table identifier */ + index_id_t index_id) /*!< in: index identifier */ +{ + dberr_t err = DB_SUCCESS; + pars_info_t* info = pars_info_create(); + + /* We use the private SQL parser of Innobase to generate the + query graphs needed in renaming indexes. */ + + static const char rename_index[] = + "PROCEDURE RENAME_INDEX_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n" + "WHERE TABLE_ID = :tableid AND ID = :indexid;\n" + "END;\n"; + + ut_ad(trx->dict_operation_lock_mode); + ut_ad(trx->dict_operation); + + trx->op_info = "renaming index to add"; + + pars_info_add_ull_literal(info, "tableid", table_id); + pars_info_add_ull_literal(info, "indexid", index_id); + + err = que_eval_sql(info, rename_index, trx); + + if (err != DB_SUCCESS) { + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. */ + trx->error_state = DB_SUCCESS; + + ib::error() << "row_merge_rename_index_to_add failed with" + " error " << err; + } + + trx->op_info = ""; + + return(err); +} + +/** Create the index and load in to the dictionary. 
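+Only an in-memory index prototype is built from the given definition; it is not
+persisted to the data dictionary here.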
+@param[in,out] table the index is on this table +@param[in] index_def the index definition +@param[in] add_v new virtual columns added along with add + index call +@return index, or NULL on error */ +dict_index_t* +row_merge_create_index( + dict_table_t* table, + const index_def_t* index_def, + const dict_add_v_col_t* add_v) +{ + dict_index_t* index; + ulint n_fields = index_def->n_fields; + ulint i; + ulint n_add_vcol = 0; + + DBUG_ENTER("row_merge_create_index"); + + ut_ad(!srv_read_only_mode); + + /* Create the index prototype, using the passed in def, this is not + a persistent operation. We pass 0 as the space id, and determine at + a lower level the space id where to store the table. */ + + index = dict_mem_index_create(table, index_def->name, + index_def->ind_type, n_fields); + index->set_committed(index_def->rebuild); + + for (i = 0; i < n_fields; i++) { + const char* name; + index_field_t* ifield = &index_def->fields[i]; + + if (ifield->is_v_col) { + if (ifield->col_no >= table->n_v_def) { + ut_ad(ifield->col_no < table->n_v_def + + add_v->n_v_col); + ut_ad(ifield->col_no >= table->n_v_def); + name = add_v->v_col_name[ + ifield->col_no - table->n_v_def]; + n_add_vcol++; + } else { + name = dict_table_get_v_col_name( + table, ifield->col_no); + } + } else { + name = dict_table_get_col_name(table, ifield->col_no); + } + + dict_mem_index_add_field(index, name, ifield->prefix_len, + ifield->descending); + } + + if (n_add_vcol) { + index->assign_new_v_col(n_add_vcol); + } + + DBUG_RETURN(index); +} + +/*********************************************************************//** +Check if a transaction can use an index. */ +bool +row_merge_is_index_usable( +/*======================*/ + const trx_t* trx, /*!< in: transaction */ + const dict_index_t* index) /*!< in: index to check */ +{ + if (!index->is_primary() + && dict_index_is_online_ddl(index)) { + /* Indexes that are being created are not useable. */ + return(false); + } + + return(!index->is_corrupted() + && (index->table->is_temporary() || index->table->no_rollback() + || index->trx_id == 0 + || !trx->read_view.is_open() + || trx->read_view.changes_visible(index->trx_id))); +} + +/** Build indexes on a table by reading a clustered index, creating a temporary +file containing index entries, merge sorting these index entries and inserting +sorted index entries to indexes. +@param[in] trx transaction +@param[in] old_table table where rows are read from +@param[in] new_table table where indexes are created; identical to +old_table unless creating a PRIMARY KEY +@param[in] online true if creating indexes online +@param[in] indexes indexes to be created +@param[in] key_numbers MySQL key numbers +@param[in] n_indexes size of indexes[] +@param[in,out] table MySQL table, for reporting erroneous key value +if applicable +@param[in] defaults default values of added, changed columns, or NULL +@param[in] col_map mapping of old column numbers to new ones, or +NULL if old_table == new_table +@param[in] add_autoinc number of added AUTO_INCREMENT columns, or +ULINT_UNDEFINED if none is added +@param[in,out] sequence autoinc sequence +@param[in] skip_pk_sort whether the new PRIMARY KEY will follow +existing order +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. stage->begin_phase_read_pk() will be called at the beginning of +this function and it will be passed to other functions for further accounting. 
+@param[in] add_v new virtual columns added along with indexes +@param[in] eval_table mysql table used to evaluate virtual column + value, see innobase_get_computed_value(). +@param[in] allow_not_null allow the conversion from null to not-null +@param[in] col_collate columns whose collations changed, or nullptr +@return DB_SUCCESS or error code */ +dberr_t +row_merge_build_indexes( + trx_t* trx, + dict_table_t* old_table, + dict_table_t* new_table, + bool online, + dict_index_t** indexes, + const ulint* key_numbers, + ulint n_indexes, + struct TABLE* table, + const dtuple_t* defaults, + const ulint* col_map, + ulint add_autoinc, + ib_sequence_t& sequence, + bool skip_pk_sort, + ut_stage_alter_t* stage, + const dict_add_v_col_t* add_v, + struct TABLE* eval_table, + bool allow_not_null, + const col_collations* col_collate) +{ + merge_file_t* merge_files; + row_merge_block_t* block; + ut_new_pfx_t block_pfx; + size_t block_size; + ut_new_pfx_t crypt_pfx; + row_merge_block_t* crypt_block = NULL; + ulint i; + ulint j; + dberr_t error; + pfs_os_file_t tmpfd = OS_FILE_CLOSED; + dict_index_t* fts_sort_idx = NULL; + fts_psort_t* psort_info = NULL; + fts_psort_t* merge_info = NULL; + bool fts_psort_initiated = false; + + double total_static_cost = 0; + double total_dynamic_cost = 0; + ulint total_index_blocks = 0; + double pct_cost=0; + double pct_progress=0; + + DBUG_ENTER("row_merge_build_indexes"); + + ut_ad(!srv_read_only_mode); + ut_ad((old_table == new_table) == !col_map); + ut_ad(!defaults || col_map); + + stage->begin_phase_read_pk(skip_pk_sort && new_table != old_table + ? n_indexes - 1 + : n_indexes); + + /* Allocate memory for merge file data structure and initialize + fields */ + + ut_allocator<row_merge_block_t> alloc(mem_key_row_merge_sort); + + /* This will allocate "3 * srv_sort_buf_size" elements of type + row_merge_block_t. The latter is defined as byte. 
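+	Three blocks are needed because a merge pass reads two input runs and
+	writes one merged output run, each through its own sort-buffer-sized
+	block.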
*/ + block_size = 3 * srv_sort_buf_size; + block = alloc.allocate_large(block_size, &block_pfx); + + if (block == NULL) { + DBUG_RETURN(DB_OUT_OF_MEMORY); + } + + crypt_pfx.m_size = 0; /* silence bogus -Wmaybe-uninitialized */ + TRASH_ALLOC(&crypt_pfx, sizeof crypt_pfx); + + if (srv_encrypt_log) { + crypt_block = static_cast<row_merge_block_t*>( + alloc.allocate_large(block_size, + &crypt_pfx)); + + if (crypt_block == NULL) { + DBUG_RETURN(DB_OUT_OF_MEMORY); + } + } + + trx_start_if_not_started_xa(trx, true); + ulint n_merge_files = 0; + + for (ulint i = 0; i < n_indexes; i++) + { + if (!dict_index_is_spatial(indexes[i])) { + n_merge_files++; + } + } + + merge_files = static_cast<merge_file_t*>( + ut_malloc_nokey(n_merge_files * sizeof *merge_files)); + + /* Initialize all the merge file descriptors, so that we + don't call row_merge_file_destroy() on uninitialized + merge file descriptor */ + + for (i = 0; i < n_merge_files; i++) { + merge_files[i].fd = OS_FILE_CLOSED; + merge_files[i].offset = 0; + merge_files[i].n_rec = 0; + } + + total_static_cost = COST_BUILD_INDEX_STATIC + * static_cast<double>(n_indexes) + COST_READ_CLUSTERED_INDEX; + total_dynamic_cost = COST_BUILD_INDEX_DYNAMIC + * static_cast<double>(n_indexes); + for (i = 0; i < n_indexes; i++) { + if (indexes[i]->type & DICT_FTS) { + ibool opt_doc_id_size = FALSE; + + /* To build FTS index, we would need to extract + doc's word, Doc ID, and word's position, so + we need to build a "fts sort index" indexing + on above three 'fields' */ + fts_sort_idx = row_merge_create_fts_sort_index( + indexes[i], old_table, &opt_doc_id_size); + + row_merge_dup_t* dup + = static_cast<row_merge_dup_t*>( + ut_malloc_nokey(sizeof *dup)); + dup->index = fts_sort_idx; + dup->table = table; + dup->col_map = col_map; + dup->n_dup = 0; + + /* This can fail e.g. if temporal files can't be + created */ + if (!row_fts_psort_info_init( + trx, dup, new_table, opt_doc_id_size, + old_table->space->zip_size(), + &psort_info, &merge_info)) { + error = DB_CORRUPTION; + goto func_exit; + } + + /* We need to ensure that we free the resources + allocated */ + fts_psort_initiated = true; + } + } + + if (global_system_variables.log_warnings > 2) { + sql_print_information("InnoDB: Online DDL : Start reading" + " clustered index of the table" + " and create temporary files"); + } + + pct_cost = COST_READ_CLUSTERED_INDEX * 100 / (total_static_cost + total_dynamic_cost); + + /* Do not continue if we can't encrypt table pages */ + if (!old_table->is_readable() || + !new_table->is_readable()) { + error = DB_DECRYPTION_FAILED; + ib_push_warning(trx->mysql_thd, DB_DECRYPTION_FAILED, + "Table %s is encrypted but encryption service or" + " used key_id is not available. " + " Can't continue reading table.", + !old_table->is_readable() ? 
old_table->name.m_name : + new_table->name.m_name); + goto func_exit; + } + + /* Read clustered index of the table and create files for + secondary index entries for merge sort */ + error = row_merge_read_clustered_index( + trx, table, old_table, new_table, online, indexes, + fts_sort_idx, psort_info, merge_files, key_numbers, + n_indexes, defaults, add_v, col_map, add_autoinc, + sequence, block, skip_pk_sort, &tmpfd, stage, + pct_cost, crypt_block, eval_table, allow_not_null, + col_collate); + + stage->end_phase_read_pk(); + + pct_progress += pct_cost; + + if (global_system_variables.log_warnings > 2) { + sql_print_information("InnoDB: Online DDL : End of reading " + "clustered index of the table" + " and create temporary files"); + } + + for (i = 0; i < n_merge_files; i++) { + total_index_blocks += merge_files[i].offset; + } + + if (error != DB_SUCCESS) { + goto func_exit; + } + + DEBUG_SYNC_C("row_merge_after_scan"); + + /* Now we have files containing index entries ready for + sorting and inserting. */ + + for (ulint k = 0, i = 0; i < n_indexes; i++) { + dict_index_t* sort_idx = indexes[i]; + + if (dict_index_is_spatial(sort_idx)) { + continue; + } + + if (indexes[i]->type & DICT_FTS) { + + sort_idx = fts_sort_idx; + + if (FTS_PLL_MERGE) { + row_fts_start_parallel_merge(merge_info); + for (j = 0; j < FTS_NUM_AUX_INDEX; j++) { + merge_info[j].task->wait(); + delete merge_info[j].task; + } + } else { + /* This cannot report duplicates; an + assertion would fail in that case. */ + error = row_fts_merge_insert( + sort_idx, new_table, + psort_info, 0); + } + +#ifdef FTS_INTERNAL_DIAG_PRINT + DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Insert\n"); +#endif + } else if (merge_files[k].fd != OS_FILE_CLOSED) { + char buf[NAME_LEN + 1]; + row_merge_dup_t dup = { + sort_idx, table, col_map, 0}; + + pct_cost = (COST_BUILD_INDEX_STATIC + + (total_dynamic_cost + * static_cast<double>(merge_files[k].offset) + / static_cast<double>(total_index_blocks))) + / (total_static_cost + total_dynamic_cost) + * PCT_COST_MERGESORT_INDEX * 100; + char* bufend = innobase_convert_name( + buf, sizeof buf, + indexes[i]->name, + strlen(indexes[i]->name), + trx->mysql_thd); + buf[bufend - buf]='\0'; + + if (global_system_variables.log_warnings > 2) { + sql_print_information("InnoDB: Online DDL :" + " Start merge-sorting" + " index %s" + " (" ULINTPF + " / " ULINTPF ")," + " estimated cost :" + " %2.4f", + buf, i + 1, n_indexes, + pct_cost); + } + + error = row_merge_sort( + trx, &dup, &merge_files[k], + block, &tmpfd, true, + pct_progress, pct_cost, + crypt_block, new_table->space_id, + stage); + + pct_progress += pct_cost; + + if (global_system_variables.log_warnings > 2) { + sql_print_information("InnoDB: Online DDL :" + " End of " + " merge-sorting index %s" + " (" ULINTPF + " / " ULINTPF ")", + buf, i + 1, n_indexes); + } + + if (error == DB_SUCCESS) { + BtrBulk btr_bulk(sort_idx, trx); + + pct_cost = (COST_BUILD_INDEX_STATIC + + (total_dynamic_cost + * static_cast<double>( + merge_files[k].offset) + / static_cast<double>( + total_index_blocks))) + / (total_static_cost + + total_dynamic_cost) + * PCT_COST_INSERT_INDEX * 100; + + if (global_system_variables.log_warnings > 2) { + sql_print_information( + "InnoDB: Online DDL : Start " + "building index %s" + " (" ULINTPF + " / " ULINTPF "), estimated " + "cost : %2.4f", buf, i + 1, + n_indexes, pct_cost); + } + + error = row_merge_insert_index_tuples( + sort_idx, old_table, + merge_files[k].fd, block, NULL, + &btr_bulk, + merge_files[k].n_rec, pct_progress, pct_cost, + 
crypt_block, new_table->space_id, + stage); + + error = btr_bulk.finish(error); + + pct_progress += pct_cost; + + if (global_system_variables.log_warnings > 2) { + sql_print_information( + "InnoDB: Online DDL : " + "End of building index %s" + " (" ULINTPF " / " ULINTPF ")", + buf, i + 1, n_indexes); + } + } + } + + /* Close the temporary file to free up space. */ + row_merge_file_destroy(&merge_files[k++]); + + if (indexes[i]->type & DICT_FTS) { + row_fts_psort_info_destroy(psort_info, merge_info); + fts_psort_initiated = false; + } else if (old_table != new_table) { + ut_ad(!sort_idx->online_log); + ut_ad(sort_idx->online_status + == ONLINE_INDEX_COMPLETE); + } + + if (old_table != new_table + || (indexes[i]->type & (DICT_FTS | DICT_SPATIAL)) + || error != DB_SUCCESS || !online) { + /* Do not apply any online log. */ + } else { + if (global_system_variables.log_warnings > 2) { + sql_print_information( + "InnoDB: Online DDL : Applying" + " log to index"); + } + + DEBUG_SYNC_C("row_log_apply_before"); + error = row_log_apply(trx, sort_idx, table, stage); + DEBUG_SYNC_C("row_log_apply_after"); + } + + if (error != DB_SUCCESS) { + trx->error_key_num = key_numbers[i]; + goto func_exit; + } + + if (indexes[i]->type & DICT_FTS + && UNIV_UNLIKELY(fts_enable_diag_print)) { + ib::info() << "Finished building full-text index " + << indexes[i]->name; + } + } + +func_exit: + + DBUG_EXECUTE_IF( + "ib_build_indexes_too_many_concurrent_trxs", + error = DB_TOO_MANY_CONCURRENT_TRXS; + trx->error_state = error;); + + if (fts_psort_initiated) { + /* Clean up FTS psort related resource */ + row_fts_psort_info_destroy(psort_info, merge_info); + fts_psort_initiated = false; + } + + row_merge_file_destroy_low(tmpfd); + + for (i = 0; i < n_merge_files; i++) { + row_merge_file_destroy(&merge_files[i]); + } + + if (fts_sort_idx) { + dict_mem_index_free(fts_sort_idx); + } + + ut_free(merge_files); + + alloc.deallocate_large(block, &block_pfx); + + if (crypt_block) { + alloc.deallocate_large(crypt_block, &crypt_pfx); + } + + DICT_TF2_FLAG_UNSET(new_table, DICT_TF2_FTS_ADD_DOC_ID); + + if (online && old_table == new_table && error != DB_SUCCESS) { + /* On error, flag all online secondary index creation + as aborted. */ + for (i = 0; i < n_indexes; i++) { + ut_ad(!(indexes[i]->type & DICT_FTS)); + ut_ad(!indexes[i]->is_committed()); + ut_ad(!dict_index_is_clust(indexes[i])); + + /* Completed indexes should be dropped as + well, and indexes whose creation was aborted + should be dropped from the persistent + storage. However, at this point we can only + set some flags in the not-yet-published + indexes. These indexes will be dropped later + in row_merge_drop_indexes(), called by + rollback_inplace_alter_table(). 
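+(Illustrative note, not part of the original comment: the switch below x-locks each ONLINE_INDEX_CREATION index, aborts its row log via row_log_abort_sec(), marks it DICT_CORRUPT, sets new_table->drop_aborted, and falls through to bump the MONITOR_BACKGROUND_DROP_INDEX counter together with the already aborted states.)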
*/ + + switch (dict_index_get_online_status(indexes[i])) { + case ONLINE_INDEX_COMPLETE: + break; + case ONLINE_INDEX_CREATION: + indexes[i]->lock.x_lock(SRW_LOCK_CALL); + row_log_abort_sec(indexes[i]); + indexes[i]->type |= DICT_CORRUPT; + indexes[i]->lock.x_unlock(); + new_table->drop_aborted = TRUE; + /* fall through */ + case ONLINE_INDEX_ABORTED_DROPPED: + case ONLINE_INDEX_ABORTED: + MONITOR_ATOMIC_INC( + MONITOR_BACKGROUND_DROP_INDEX); + } + } + + dict_index_t *clust_index= new_table->indexes.start; + clust_index->lock.x_lock(SRW_LOCK_CALL); + ut_ad(!clust_index->online_log || + clust_index->online_log_is_dummy()); + clust_index->online_log= nullptr; + clust_index->lock.x_unlock(); + } + + DBUG_RETURN(error); +} + +dberr_t row_merge_bulk_t::alloc_block() +{ + if (m_block) + return DB_SUCCESS; + m_block= m_alloc.allocate_large_dontdump( + 3 * srv_sort_buf_size, &m_block_pfx); + if (m_block == nullptr) + return DB_OUT_OF_MEMORY; + + m_crypt_pfx.m_size= 0; + TRASH_ALLOC(&m_crypt_pfx, sizeof m_crypt_pfx); + if (srv_encrypt_log) + { + m_crypt_block= static_cast<row_merge_block_t*>( + m_alloc.allocate_large(3 * srv_sort_buf_size, &m_crypt_pfx)); + if (!m_crypt_block) + return DB_OUT_OF_MEMORY; + } + return DB_SUCCESS; +} + +row_merge_bulk_t::row_merge_bulk_t(dict_table_t *table) +{ + ulint n_index= 0; + for (dict_index_t *index= UT_LIST_GET_FIRST(table->indexes); + index; index= UT_LIST_GET_NEXT(indexes, index)) + { + if (!index->is_btree()) + continue; + n_index++; + } + + m_merge_buf= static_cast<row_merge_buf_t*>( + ut_zalloc_nokey(n_index * sizeof *m_merge_buf)); + + ulint i= 0; + for (dict_index_t *index= UT_LIST_GET_FIRST(table->indexes); + index; index= UT_LIST_GET_NEXT(indexes, index)) + { + if (!index->is_btree()) + continue; + + mem_heap_t *heap= mem_heap_create(100); + row_merge_buf_create_low(&m_merge_buf[i], heap, index); + i++; + } + + m_tmpfd= OS_FILE_CLOSED; + m_blob_file.fd= OS_FILE_CLOSED; + m_blob_file.offset= 0; + m_blob_file.n_rec= 0; +} + +row_merge_bulk_t::~row_merge_bulk_t() +{ + ulint i= 0; + dict_table_t *table= m_merge_buf[0].index->table; + for (dict_index_t *index= UT_LIST_GET_FIRST(table->indexes); + index; index= UT_LIST_GET_NEXT(indexes, index)) + { + if (!index->is_btree()) + continue; + row_merge_buf_free(&m_merge_buf[i]); + if (m_merge_files) + row_merge_file_destroy(&m_merge_files[i]); + i++; + } + + row_merge_file_destroy_low(m_tmpfd); + + row_merge_file_destroy(&m_blob_file); + + ut_free(m_merge_buf); + + ut_free(m_merge_files); + + if (m_block) + m_alloc.deallocate_large(m_block, &m_block_pfx); + + if (m_crypt_block) + m_alloc.deallocate_large(m_crypt_block, &m_crypt_pfx); +} + +void row_merge_bulk_t::init_tmp_file() +{ + if (m_merge_files) + return; + + ulint n_index= 0; + dict_table_t *table= m_merge_buf[0].index->table; + for (dict_index_t *index= UT_LIST_GET_FIRST(table->indexes); + index; index= UT_LIST_GET_NEXT(indexes, index)) + { + if (!index->is_btree()) + continue; + n_index++; + } + + m_merge_files= static_cast<merge_file_t*>( + ut_malloc_nokey(n_index * sizeof *m_merge_files)); + + for (ulint i= 0; i < n_index; i++) + { + m_merge_files[i].fd= OS_FILE_CLOSED; + m_merge_files[i].offset= 0; + m_merge_files[i].n_rec= 0; + } +} + +void row_merge_bulk_t::clean_bulk_buffer(ulint index_no) +{ + mem_heap_empty(m_merge_buf[index_no].heap); + m_merge_buf[index_no].total_size = m_merge_buf[index_no].n_tuples = 0; +} + +bool row_merge_bulk_t::create_tmp_file(ulint index_no) +{ + return row_merge_file_create_if_needed( + &m_merge_files[index_no], 
&m_tmpfd, + m_merge_buf[index_no].n_tuples, NULL); +} + +dberr_t row_merge_bulk_t::write_to_tmp_file(ulint index_no) +{ + if (!create_tmp_file(index_no)) + return DB_OUT_OF_MEMORY; + merge_file_t *file= &m_merge_files[index_no]; + row_merge_buf_t *buf= &m_merge_buf[index_no]; + + alloc_block(); + + if (dberr_t err= row_merge_buf_write(buf, +#ifndef DBUG_OFF + file, +#endif + m_block, + index_no == 0 ? &m_blob_file : nullptr)) + return err; + + if (!row_merge_write(file->fd, file->offset++, + m_block, m_crypt_block, + buf->index->table->space->id)) + return DB_TEMP_FILE_WRITE_FAIL; + MEM_UNDEFINED(&m_block[0], srv_sort_buf_size); + return DB_SUCCESS; +} + +dberr_t row_merge_bulk_t::bulk_insert_buffered(const dtuple_t &row, + const dict_index_t &ind, + trx_t *trx) +{ + dberr_t err= DB_SUCCESS; + ulint i= 0; + mem_heap_t *large_tuple_heap= nullptr; + for (dict_index_t *index= UT_LIST_GET_FIRST(ind.table->indexes); + index; index= UT_LIST_GET_NEXT(indexes, index)) + { + if (!index->is_btree()) + continue; + + if (index != &ind) + { + i++; + continue; + } + row_merge_buf_t *buf= &m_merge_buf[i]; +add_to_buf: + if (row_merge_bulk_buf_add(buf, *ind.table, row)) + { + i++; + goto func_exit; + } + + if (buf->n_tuples == 0) + { + /* Tuple data size is greater than srv_sort_buf_size */ + dtuple_t *big_tuple= row_merge_buf_large_tuple( + row, &m_blob_file, &large_tuple_heap); + if (row_merge_bulk_buf_add(buf, *ind.table, *big_tuple)) + { + i++; + goto func_exit; + } + } + + if (index->is_unique()) + { + row_merge_dup_t dup{index, nullptr, nullptr, 0}; + row_merge_buf_sort(buf, &dup); + if (dup.n_dup) + { + trx->error_info= index; + err= DB_DUPLICATE_KEY; + goto func_exit; + } + } + else + row_merge_buf_sort(buf, NULL); + init_tmp_file(); + merge_file_t *file= &m_merge_files[i]; + file->n_rec+= buf->n_tuples; + err= write_to_tmp_file(i); + if (err != DB_SUCCESS) + { + trx->error_info= index; + goto func_exit; + } + clean_bulk_buffer(i); + buf= &m_merge_buf[i]; + goto add_to_buf; + } + +func_exit: + if (large_tuple_heap) + mem_heap_free(large_tuple_heap); + return err; +} + +dberr_t row_merge_bulk_t::write_to_index(ulint index_no, trx_t *trx) +{ + dberr_t err= DB_SUCCESS; + row_merge_buf_t buf= m_merge_buf[index_no]; + merge_file_t *file= m_merge_files ? + &m_merge_files[index_no] : nullptr; + dict_index_t *index= buf.index; + dict_table_t *table= index->table; + BtrBulk btr_bulk(index, trx); + row_merge_dup_t dup = {index, nullptr, nullptr, 0}; + + if (buf.n_tuples) + { + if (dict_index_is_unique(index)) + { + row_merge_buf_sort(&buf, &dup); + if (dup.n_dup) + { + err= DB_DUPLICATE_KEY; + goto func_exit; + } + } + else row_merge_buf_sort(&buf, NULL); + if (file && file->fd != OS_FILE_CLOSED) + { + file->n_rec+= buf.n_tuples; + err= write_to_tmp_file(index_no); + if (err!= DB_SUCCESS) + goto func_exit; + } + else + { + /* Data got fit in merge buffer. */ + err= row_merge_insert_index_tuples( + index, table, OS_FILE_CLOSED, nullptr, + &buf, &btr_bulk, 0, 0, 0, nullptr, table->space_id, nullptr, + m_blob_file.fd == OS_FILE_CLOSED ? 
nullptr : &m_blob_file); + goto func_exit; + } + } + + err= row_merge_sort(trx, &dup, file, + m_block, &m_tmpfd, true, 0, 0, + m_crypt_block, table->space_id, nullptr); + if (err != DB_SUCCESS) + goto func_exit; + + err= row_merge_insert_index_tuples( + index, table, file->fd, m_block, nullptr, + &btr_bulk, 0, 0, 0, m_crypt_block, table->space_id, + nullptr, &m_blob_file); + +func_exit: + if (err != DB_SUCCESS) + trx->error_info= index; + else if (index->is_primary() && table->persistent_autoinc) + btr_write_autoinc(index, table->autoinc - 1); + err= btr_bulk.finish(err); + return err; +} + +dberr_t row_merge_bulk_t::write_to_table(dict_table_t *table, trx_t *trx) +{ + ulint i= 0; + for (dict_index_t *index= UT_LIST_GET_FIRST(table->indexes); + index; index= UT_LIST_GET_NEXT(indexes, index)) + { + if (!index->is_btree()) + continue; + + dberr_t err= write_to_index(i, trx); + if (err != DB_SUCCESS) + return err; + i++; + } + + return DB_SUCCESS; +} + +dberr_t trx_mod_table_time_t::write_bulk(dict_table_t *table, trx_t *trx) +{ + if (!bulk_store) + return DB_SUCCESS; + dberr_t err= bulk_store->write_to_table(table, trx); + delete bulk_store; + bulk_store= nullptr; + return err; +} + +dberr_t trx_t::bulk_insert_apply_low() +{ + ut_ad(bulk_insert); + ut_ad(!check_unique_secondary); + ut_ad(!check_foreigns); + dberr_t err; + for (auto& t : mod_tables) + if (t.second.is_bulk_insert()) + if ((err= t.second.write_bulk(t.first, this)) != DB_SUCCESS) + goto bulk_rollback; + return DB_SUCCESS; +bulk_rollback: + undo_no_t low_limit= UINT64_MAX; + for (auto& t : mod_tables) + { + if (t.second.is_bulk_insert()) + { + if (t.second.get_first() < low_limit) + low_limit= t.second.get_first(); + delete t.second.bulk_store; + t.second.bulk_store= nullptr; + } + } + trx_savept_t bulk_save{low_limit}; + rollback(&bulk_save); + return err; +} diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc new file mode 100644 index 00000000..c5ee3be7 --- /dev/null +++ b/storage/innobase/row/row0mysql.cc @@ -0,0 +1,2916 @@ +/***************************************************************************** + +Copyright (c) 2000, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0mysql.cc +Interface between Innobase row operations and MySQL. +Contains also create table and other data dictionary operations. 
+ +Created 9/17/2000 Heikki Tuuri +*******************************************************/ + +#include "univ.i" +#include <debug_sync.h> +#include <gstream.h> +#include <spatial.h> + +#include "row0mysql.h" +#include "buf0flu.h" +#include "btr0sea.h" +#include "dict0boot.h" +#include "dict0crea.h" +#include "dict0dict.h" +#include "dict0load.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" +#include "fil0fil.h" +#include "fil0crypt.h" +#include "fsp0file.h" +#include "fts0fts.h" +#include "fts0types.h" +#include "ibuf0ibuf.h" +#include "lock0lock.h" +#include "log0log.h" +#include "pars0pars.h" +#include "que0que.h" +#include "rem0cmp.h" +#include "row0import.h" +#include "row0ins.h" +#include "row0row.h" +#include "row0sel.h" +#include "row0upd.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "srv0mon.h" +#include "srv0start.h" +#include "log.h" + +#include <algorithm> +#include <vector> +#include <thread> + + +/** Delay an INSERT, DELETE or UPDATE operation if the purge is lagging. */ +static void row_mysql_delay_if_needed() +{ + const auto delay= srv_dml_needed_delay; + if (UNIV_UNLIKELY(delay != 0)) + { + /* Adjust for purge_coordinator_state::refresh() */ + log_sys.latch.rd_lock(SRW_LOCK_CALL); + const lsn_t last= log_sys.last_checkpoint_lsn, + max_age= log_sys.max_checkpoint_age; + log_sys.latch.rd_unlock(); + const lsn_t lsn= log_sys.get_lsn(); + if ((lsn - last) / 4 >= max_age / 5) + buf_flush_ahead(last + max_age / 5, false); + purge_sys.wake_if_not_active(); + std::this_thread::sleep_for(std::chrono::microseconds(delay)); + } +} + +/*******************************************************************//** +Frees the blob heap in prebuilt when no longer needed. */ +void +row_mysql_prebuilt_free_blob_heap( +/*==============================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct of a + ha_innobase:: table handle */ +{ + DBUG_ENTER("row_mysql_prebuilt_free_blob_heap"); + + DBUG_PRINT("row_mysql_prebuilt_free_blob_heap", + ("blob_heap freeing: %p", prebuilt->blob_heap)); + + mem_heap_free(prebuilt->blob_heap); + prebuilt->blob_heap = NULL; + DBUG_VOID_RETURN; +} + +/*******************************************************************//** +Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row +format. +@return pointer to the data, we skip the 1 or 2 bytes at the start +that are used to store the len */ +byte* +row_mysql_store_true_var_len( +/*=========================*/ + byte* dest, /*!< in: where to store */ + ulint len, /*!< in: length, must fit in two bytes */ + ulint lenlen) /*!< in: storage length of len: either 1 or 2 bytes */ +{ + if (lenlen == 2) { + ut_a(len < 256 * 256); + + mach_write_to_2_little_endian(dest, len); + + return(dest + 2); + } + + ut_a(lenlen == 1); + ut_a(len < 256); + + mach_write_to_1(dest, len); + + return(dest + 1); +} + +/*******************************************************************//** +Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and +returns a pointer to the data. 
+@return pointer to the data, we skip the 1 or 2 bytes at the start +that are used to store the len */ +const byte* +row_mysql_read_true_varchar( +/*========================*/ + ulint* len, /*!< out: variable-length field length */ + const byte* field, /*!< in: field in the MySQL format */ + ulint lenlen) /*!< in: storage length of len: either 1 + or 2 bytes */ +{ + if (lenlen == 2) { + *len = mach_read_from_2_little_endian(field); + + return(field + 2); + } + + ut_a(lenlen == 1); + + *len = mach_read_from_1(field); + + return(field + 1); +} + +/*******************************************************************//** +Stores a reference to a BLOB in the MySQL format. */ +void +row_mysql_store_blob_ref( +/*=====================*/ + byte* dest, /*!< in: where to store */ + ulint col_len,/*!< in: dest buffer size: determines into + how many bytes the BLOB length is stored, + the space for the length may vary from 1 + to 4 bytes */ + const void* data, /*!< in: BLOB data; if the value to store + is SQL NULL this should be NULL pointer */ + ulint len) /*!< in: BLOB length; if the value to store + is SQL NULL this should be 0; remember + also to set the NULL bit in the MySQL record + header! */ +{ + /* MySQL might assume the field is set to zero except the length and + the pointer fields */ + + memset(dest, '\0', col_len); + + /* In dest there are 1 - 4 bytes reserved for the BLOB length, + and after that 8 bytes reserved for the pointer to the data. + In 32-bit architectures we only use the first 4 bytes of the pointer + slot. */ + + ut_a(col_len - 8 > 1 || len < 256); + ut_a(col_len - 8 > 2 || len < 256 * 256); + ut_a(col_len - 8 > 3 || len < 256 * 256 * 256); + + mach_write_to_n_little_endian(dest, col_len - 8, len); + + memcpy(dest + col_len - 8, &data, sizeof data); +} + +/*******************************************************************//** +Reads a reference to a BLOB in the MySQL format. +@return pointer to BLOB data */ +const byte* +row_mysql_read_blob_ref( +/*====================*/ + ulint* len, /*!< out: BLOB length */ + const byte* ref, /*!< in: BLOB reference in the + MySQL format */ + ulint col_len) /*!< in: BLOB reference length + (not BLOB length) */ +{ + byte* data; + + *len = mach_read_from_n_little_endian(ref, col_len - 8); + + memcpy(&data, ref + col_len - 8, sizeof data); + + return(data); +} + +/*******************************************************************//** +Converting InnoDB geometry data format to MySQL data format. */ +void +row_mysql_store_geometry( +/*=====================*/ + byte* dest, /*!< in/out: where to store */ + ulint dest_len, /*!< in: dest buffer size: determines + into how many bytes the GEOMETRY length + is stored, the space for the length + may vary from 1 to 4 bytes */ + const byte* src, /*!< in: GEOMETRY data; if the value to + store is SQL NULL this should be NULL + pointer */ + ulint src_len) /*!< in: GEOMETRY length; if the value + to store is SQL NULL this should be 0; + remember also to set the NULL bit in + the MySQL record header! */ +{ + /* MySQL might assume the field is set to zero except the length and + the pointer fields */ + MEM_CHECK_DEFINED(src, src_len); + + memset(dest, '\0', dest_len); + + /* In dest there are 1 - 4 bytes reserved for the BLOB length, + and after that 8 bytes reserved for the pointer to the data. + In 32-bit architectures we only use the first 4 bytes of the pointer + slot. 
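+(Illustrative note, not part of the original comment: with dest_len = 12, src_len is written little-endian into dest[0..3] and the in-memory pointer to the data into dest[4..11].)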
*/ + + ut_ad(dest_len - 8 > 1 || src_len < 1<<8); + ut_ad(dest_len - 8 > 2 || src_len < 1<<16); + ut_ad(dest_len - 8 > 3 || src_len < 1<<24); + + mach_write_to_n_little_endian(dest, dest_len - 8, src_len); + + memcpy(dest + dest_len - 8, &src, sizeof src); +} + +/*******************************************************************//** +Read geometry data in the MySQL format. +@return pointer to geometry data */ +static +const byte* +row_mysql_read_geometry( +/*====================*/ + ulint* len, /*!< out: data length */ + const byte* ref, /*!< in: geometry data in the + MySQL format */ + ulint col_len) /*!< in: MySQL format length */ +{ + byte* data; + ut_ad(col_len > 8); + + *len = mach_read_from_n_little_endian(ref, col_len - 8); + + memcpy(&data, ref + col_len - 8, sizeof data); + + return(data); +} + +/**************************************************************//** +Pad a column with spaces. */ +void +row_mysql_pad_col( +/*==============*/ + ulint mbminlen, /*!< in: minimum size of a character, + in bytes */ + byte* pad, /*!< out: padded buffer */ + ulint len) /*!< in: number of bytes to pad */ +{ + const byte* pad_end; + + switch (UNIV_EXPECT(mbminlen, 1)) { + default: + ut_error; + case 1: + /* space=0x20 */ + memset(pad, 0x20, len); + break; + case 2: + /* space=0x0020 */ + pad_end = pad + len; + ut_a(!(len % 2)); + while (pad < pad_end) { + *pad++ = 0x00; + *pad++ = 0x20; + }; + break; + case 4: + /* space=0x00000020 */ + pad_end = pad + len; + ut_a(!(len % 4)); + while (pad < pad_end) { + *pad++ = 0x00; + *pad++ = 0x00; + *pad++ = 0x00; + *pad++ = 0x20; + } + break; + } +} + +/**************************************************************//** +Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format. +The counterpart of this function is row_sel_field_store_in_mysql_format() in +row0sel.cc. +@return up to which byte we used buf in the conversion */ +byte* +row_mysql_store_col_in_innobase_format( +/*===================================*/ + dfield_t* dfield, /*!< in/out: dfield where dtype + information must be already set when + this function is called! */ + byte* buf, /*!< in/out: buffer for a converted + integer value; this must be at least + col_len long then! NOTE that dfield + may also get a pointer to 'buf', + therefore do not discard this as long + as dfield is used! */ + ibool row_format_col, /*!< TRUE if the mysql_data is from + a MySQL row, FALSE if from a MySQL + key value; + in MySQL, a true VARCHAR storage + format differs in a row and in a + key value: in a key value the length + is always stored in 2 bytes! */ + const byte* mysql_data, /*!< in: MySQL column value, not + SQL NULL; NOTE that dfield may also + get a pointer to mysql_data, + therefore do not discard this as long + as dfield is used! */ + ulint col_len, /*!< in: MySQL column length; NOTE that + this is the storage length of the + column in the MySQL format row, not + necessarily the length of the actual + payload data; if the column is a true + VARCHAR then this is irrelevant */ + ulint comp) /*!< in: nonzero=compact format */ +{ + const byte* ptr = mysql_data; + const dtype_t* dtype; + ulint type; + ulint lenlen; + + dtype = dfield_get_type(dfield); + + type = dtype->mtype; + + if (type == DATA_INT) { + /* Store integer data in Innobase in a big-endian format, + sign bit negated if the data is a signed integer. In MySQL, + integers are stored in a little-endian format. 
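+(Illustrative note, not part of the original comment: a signed 32-bit value 1 arrives as the little-endian bytes 01 00 00 00, the loop below reverses them to 00 00 00 01, and the sign-bit flip produces 80 00 00 01; likewise -1 becomes 7F FF FF FF, so the stored forms compare correctly byte by byte.)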
*/ + + byte* p = buf + col_len; + + for (;;) { + p--; + *p = *mysql_data; + if (p == buf) { + break; + } + mysql_data++; + } + + if (!(dtype->prtype & DATA_UNSIGNED)) { + + *buf ^= 128; + } + + ptr = buf; + buf += col_len; + } else if ((type == DATA_VARCHAR + || type == DATA_VARMYSQL + || type == DATA_BINARY)) { + + if (dtype_get_mysql_type(dtype) == DATA_MYSQL_TRUE_VARCHAR) { + /* The length of the actual data is stored to 1 or 2 + bytes at the start of the field */ + + if (row_format_col) { + if (dtype->prtype & DATA_LONG_TRUE_VARCHAR) { + lenlen = 2; + } else { + lenlen = 1; + } + } else { + /* In a MySQL key value, lenlen is always 2 */ + lenlen = 2; + } + + ptr = row_mysql_read_true_varchar(&col_len, mysql_data, + lenlen); + } else { + /* Remove trailing spaces from old style VARCHAR + columns. */ + + /* Handle Unicode strings differently. */ + ulint mbminlen = dtype_get_mbminlen(dtype); + + ptr = mysql_data; + + switch (mbminlen) { + default: + ut_error; + case 4: + /* space=0x00000020 */ + /* Trim "half-chars", just in case. */ + col_len &= ~3U; + + while (col_len >= 4 + && ptr[col_len - 4] == 0x00 + && ptr[col_len - 3] == 0x00 + && ptr[col_len - 2] == 0x00 + && ptr[col_len - 1] == 0x20) { + col_len -= 4; + } + break; + case 2: + /* space=0x0020 */ + /* Trim "half-chars", just in case. */ + col_len &= ~1U; + + while (col_len >= 2 && ptr[col_len - 2] == 0x00 + && ptr[col_len - 1] == 0x20) { + col_len -= 2; + } + break; + case 1: + /* space=0x20 */ + while (col_len > 0 + && ptr[col_len - 1] == 0x20) { + col_len--; + } + } + } + } else if (comp && type == DATA_MYSQL + && dtype_get_mbminlen(dtype) == 1 + && dtype_get_mbmaxlen(dtype) > 1) { + /* In some cases we strip trailing spaces from UTF-8 and other + multibyte charsets, from FIXED-length CHAR columns, to save + space. UTF-8 would otherwise normally use 3 * the string length + bytes to store an ASCII string! */ + + /* We assume that this CHAR field is encoded in a + variable-length character set where spaces have + 1:1 correspondence to 0x20 bytes, such as UTF-8. + + Consider a CHAR(n) field, a field of n characters. + It will contain between n * mbminlen and n * mbmaxlen bytes. + We will try to truncate it to n bytes by stripping + space padding. If the field contains single-byte + characters only, it will be truncated to n characters. + Consider a CHAR(5) field containing the string + ".a " where "." denotes a 3-byte character represented + by the bytes "$%&". After our stripping, the string will + be stored as "$%&a " (5 bytes). The string + ".abc " will be stored as "$%&abc" (6 bytes). + + The space padding will be restored in row0sel.cc, function + row_sel_field_store_in_mysql_format(). */ + + ulint n_chars; + + ut_a(!(dtype_get_len(dtype) % dtype_get_mbmaxlen(dtype))); + + n_chars = dtype_get_len(dtype) / dtype_get_mbmaxlen(dtype); + + /* Strip space padding. */ + while (col_len > n_chars && ptr[col_len - 1] == 0x20) { + col_len--; + } + } else if (!row_format_col) { + /* if mysql data is from a MySQL key value + since the length is always stored in 2 bytes, + we need do nothing here. */ + } else if (type == DATA_BLOB) { + + ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len); + } else if (DATA_GEOMETRY_MTYPE(type)) { + ptr = row_mysql_read_geometry(&col_len, mysql_data, col_len); + } + + dfield_set_data(dfield, ptr, col_len); + + return(buf); +} + +/**************************************************************//** +Convert a row in the MySQL format to a row in the Innobase format. 
Note that +the function to convert a MySQL format key value to an InnoDB dtuple is +row_sel_convert_mysql_key_to_innobase() in row0sel.cc. */ +static +void +row_mysql_convert_row_to_innobase( +/*==============================*/ + dtuple_t* row, /*!< in/out: Innobase row where the + field type information is already + copied there! */ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct where template + must be of type ROW_MYSQL_WHOLE_ROW */ + const byte* mysql_rec, /*!< in: row in the MySQL format; + NOTE: do not discard as long as + row is used, as row may contain + pointers to this record! */ + mem_heap_t** blob_heap) /*!< in: FIX_ME, remove this after + server fixes its issue */ +{ + const mysql_row_templ_t*templ; + dfield_t* dfield; + ulint i; + ulint n_col = 0; + ulint n_v_col = 0; + + ut_ad(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW); + ut_ad(prebuilt->mysql_template); + + for (i = 0; i < prebuilt->n_template; i++) { + + templ = prebuilt->mysql_template + i; + + if (templ->is_virtual) { + ut_ad(n_v_col < dtuple_get_n_v_fields(row)); + dfield = dtuple_get_nth_v_field(row, n_v_col); + n_v_col++; + } else { + dfield = dtuple_get_nth_field(row, n_col); + n_col++; + } + + if (templ->mysql_null_bit_mask != 0) { + /* Column may be SQL NULL */ + + if (mysql_rec[templ->mysql_null_byte_offset] + & (byte) (templ->mysql_null_bit_mask)) { + + /* It is SQL NULL */ + + dfield_set_null(dfield); + + goto next_column; + } + } + + row_mysql_store_col_in_innobase_format( + dfield, + prebuilt->ins_upd_rec_buff + templ->mysql_col_offset, + TRUE, /* MySQL row format data */ + mysql_rec + templ->mysql_col_offset, + templ->mysql_col_len, + dict_table_is_comp(prebuilt->table)); + + /* server has issue regarding handling BLOB virtual fields, + and we need to duplicate it with our own memory here */ + if (templ->is_virtual + && DATA_LARGE_MTYPE(dfield_get_type(dfield)->mtype)) { + if (*blob_heap == NULL) { + *blob_heap = mem_heap_create(dfield->len); + } + dfield_dup(dfield, *blob_heap); + } +next_column: + ; + } + + /* If there is a FTS doc id column and it is not user supplied ( + generated by server) then assign it a new doc id. */ + if (!prebuilt->table->fts) { + return; + } + + ut_a(prebuilt->table->fts->doc_col != ULINT_UNDEFINED); + + doc_id_t doc_id; + + if (!DICT_TF2_FLAG_IS_SET(prebuilt->table, DICT_TF2_FTS_HAS_DOC_ID)) { + if (prebuilt->table->fts->cache->first_doc_id + == FTS_NULL_DOC_ID) { + fts_get_next_doc_id(prebuilt->table, &doc_id); + } + return; + } + + dfield_t* fts_doc_id = dtuple_get_nth_field( + row, prebuilt->table->fts->doc_col); + + if (fts_get_next_doc_id(prebuilt->table, &doc_id) == DB_SUCCESS) { + ut_a(doc_id != FTS_NULL_DOC_ID); + ut_ad(sizeof(doc_id) == fts_doc_id->type.len); + dfield_set_data(fts_doc_id, prebuilt->ins_upd_rec_buff + + prebuilt->mysql_row_len, 8); + fts_write_doc_id(fts_doc_id->data, doc_id); + } else { + dfield_set_null(fts_doc_id); + } +} + +/****************************************************************//** +Handles user errors and lock waits detected by the database engine. +@return true if it was a lock wait and we should continue running the +query thread and in that case the thr is ALREADY in the running state. 
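+(Illustrative note, not part of the original comment: callers invoke this in a retry loop of the form
+  do { ... err = lock_table(...); trx->error_state = err; }
+  while (err != DB_SUCCESS && row_mysql_handle_errors(&err, trx, thr, NULL));
+as in row_lock_table_autoinc_for_mysql() and row_lock_table() below.)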
*/ +bool +row_mysql_handle_errors( +/*====================*/ + dberr_t* new_err,/*!< out: possible new error encountered in + lock wait, or if no new error, the value + of trx->error_state at the entry of this + function */ + trx_t* trx, /*!< in: transaction */ + que_thr_t* thr, /*!< in: query thread, or NULL */ + trx_savept_t* savept) /*!< in: savepoint, or NULL */ +{ + dberr_t err; + + DBUG_ENTER("row_mysql_handle_errors"); + DEBUG_SYNC_C("row_mysql_handle_errors"); + + err = trx->error_state; + +handle_new_error: + ut_a(err != DB_SUCCESS); + + trx->error_state = DB_SUCCESS; + + DBUG_LOG("trx", "handle error: " << err + << ";id=" << ib::hex(trx->id) << ", " << trx); + + switch (err) { + case DB_LOCK_WAIT_TIMEOUT: + extern my_bool innobase_rollback_on_timeout; + if (innobase_rollback_on_timeout) { + goto rollback; + } + /* fall through */ + case DB_DUPLICATE_KEY: + case DB_FOREIGN_DUPLICATE_KEY: + case DB_TOO_BIG_RECORD: + case DB_UNDO_RECORD_TOO_BIG: + case DB_ROW_IS_REFERENCED: + case DB_NO_REFERENCED_ROW: + case DB_CANNOT_ADD_CONSTRAINT: + case DB_TOO_MANY_CONCURRENT_TRXS: + case DB_OUT_OF_FILE_SPACE: + case DB_READ_ONLY: + case DB_FTS_INVALID_DOCID: + case DB_INTERRUPTED: + case DB_CANT_CREATE_GEOMETRY_OBJECT: + case DB_TABLE_NOT_FOUND: + case DB_DECRYPTION_FAILED: + case DB_COMPUTE_VALUE_FAILED: + rollback_to_savept: + DBUG_EXECUTE_IF("row_mysql_crash_if_error", { + log_buffer_flush_to_disk(); + DBUG_SUICIDE(); }); + if (savept) { + /* Roll back the latest, possibly incomplete insertion + or update */ + + trx->rollback(savept); + } + if (!trx->bulk_insert) { + /* MariaDB will roll back the latest SQL statement */ + break; + } + /* MariaDB will roll back the entire transaction. */ + trx->bulk_insert = false; + trx->last_sql_stat_start.least_undo_no = 0; + trx->savepoints_discard(); + break; + case DB_LOCK_WAIT: + err = lock_wait(thr); + if (err != DB_SUCCESS) { + goto handle_new_error; + } + + *new_err = err; + + DBUG_RETURN(true); + + case DB_DEADLOCK: + case DB_LOCK_TABLE_FULL: + rollback: + /* Roll back the whole transaction; this resolution was added + to version 3.23.43 */ + + trx->rollback(); + break; + + case DB_IO_ERROR: + case DB_TABLE_CORRUPT: + case DB_CORRUPTION: + case DB_PAGE_CORRUPTED: + ib::error() << "We detected index corruption in an InnoDB type" + " table. You have to dump + drop + reimport the" + " table or, in a case of widespread corruption," + " dump all InnoDB tables and recreate the whole" + " tablespace. If the mariadbd server crashes after" + " the startup or when you dump the tables. " + << FORCE_RECOVERY_MSG; + goto rollback_to_savept; + case DB_FOREIGN_EXCEED_MAX_CASCADE: + ib::error() << "Cannot delete/update rows with cascading" + " foreign key constraints that exceed max depth of " + << FK_MAX_CASCADE_DEL << ". Please drop excessive" + " foreign constraints and try again"; + goto rollback_to_savept; + case DB_UNSUPPORTED: + ib::error() << "Cannot delete/update rows with cascading" + " foreign key constraints in timestamp-based temporal" + " table. Please drop excessive" + " foreign constraints and try again"; + goto rollback_to_savept; + default: + ib::fatal() << "Unknown error " << err; + } + + if (dberr_t n_err = trx->error_state) { + trx->error_state = DB_SUCCESS; + *new_err = n_err; + } else { + *new_err = err; + } + + DBUG_RETURN(false); +} + +/********************************************************************//** +Create a prebuilt struct for a MySQL table handle. 
+@return own: a prebuilt struct */ +row_prebuilt_t* +row_create_prebuilt( +/*================*/ + dict_table_t* table, /*!< in: Innobase table handle */ + ulint mysql_row_len) /*!< in: length in bytes of a row in + the MySQL format */ +{ + DBUG_ENTER("row_create_prebuilt"); + + row_prebuilt_t* prebuilt; + mem_heap_t* heap; + dict_index_t* clust_index; + dict_index_t* temp_index; + dtuple_t* ref; + ulint ref_len; + uint srch_key_len = 0; + ulint search_tuple_n_fields; + + search_tuple_n_fields = 2 * (dict_table_get_n_cols(table) + + dict_table_get_n_v_cols(table)); + + clust_index = dict_table_get_first_index(table); + + /* Make sure that search_tuple is long enough for clustered index */ + ut_a(2 * unsigned(table->n_cols) >= unsigned(clust_index->n_fields) + - clust_index->table->n_dropped()); + + ref_len = dict_index_get_n_unique(clust_index); + + + /* Maximum size of the buffer needed for conversion of INTs from + little endian format to big endian format in an index. An index + can have maximum 16 columns (MAX_REF_PARTS) in it. Therfore + Max size for PK: 16 * 8 bytes (BIGINT's size) = 128 bytes + Max size Secondary index: 16 * 8 bytes + PK = 256 bytes. */ +#define MAX_SRCH_KEY_VAL_BUFFER 2* (8 * MAX_REF_PARTS) + +#define PREBUILT_HEAP_INITIAL_SIZE \ + ( \ + sizeof(*prebuilt) \ + /* allocd in this function */ \ + + DTUPLE_EST_ALLOC(search_tuple_n_fields) \ + + DTUPLE_EST_ALLOC(ref_len) \ + /* allocd in row_prebuild_sel_graph() */ \ + + sizeof(sel_node_t) \ + + sizeof(que_fork_t) \ + + sizeof(que_thr_t) \ + /* allocd in row_get_prebuilt_update_vector() */ \ + + sizeof(upd_node_t) \ + + sizeof(upd_t) \ + + sizeof(upd_field_t) \ + * dict_table_get_n_cols(table) \ + + sizeof(que_fork_t) \ + + sizeof(que_thr_t) \ + /* allocd in row_get_prebuilt_insert_row() */ \ + + sizeof(ins_node_t) \ + /* mysql_row_len could be huge and we are not \ + sure if this prebuilt instance is going to be \ + used in inserts */ \ + + (mysql_row_len < 256 ? mysql_row_len : 0) \ + + DTUPLE_EST_ALLOC(dict_table_get_n_cols(table) \ + + dict_table_get_n_v_cols(table)) \ + + sizeof(que_fork_t) \ + + sizeof(que_thr_t) \ + + sizeof(*prebuilt->pcur) \ + + sizeof(*prebuilt->clust_pcur) \ + ) + + /* Calculate size of key buffer used to store search key in + InnoDB format. MySQL stores INTs in little endian format and + InnoDB stores INTs in big endian format with the sign bit + flipped. All other field types are stored/compared the same + in MySQL and InnoDB, so we must create a buffer containing + the INT key parts in InnoDB format.We need two such buffers + since both start and end keys are used in records_in_range(). 
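+(Illustrative note, not part of the original comment: with MAX_REF_PARTS = 16 the MAX_SRCH_KEY_VAL_BUFFER bound defined above evaluates to 2 * (8 * 16) = 256 bytes; the loop below adds up only the fixed lengths of DATA_INT key parts, so srch_key_len is usually much smaller.)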
*/ + + for (temp_index = dict_table_get_first_index(table); temp_index; + temp_index = dict_table_get_next_index(temp_index)) { + DBUG_EXECUTE_IF("innodb_srch_key_buffer_max_value", + ut_a(temp_index->n_user_defined_cols + == MAX_REF_PARTS);); + if (temp_index->is_corrupted()) { + continue; + } + + uint temp_len = 0; + for (uint i = 0; i < temp_index->n_uniq; i++) { + ulint type = temp_index->fields[i].col->mtype; + if (type == DATA_INT) { + temp_len += + temp_index->fields[i].fixed_len; + } + } + srch_key_len = std::max(srch_key_len,temp_len); + } + + ut_a(srch_key_len <= MAX_SRCH_KEY_VAL_BUFFER); + + DBUG_EXECUTE_IF("innodb_srch_key_buffer_max_value", + ut_a(srch_key_len == MAX_SRCH_KEY_VAL_BUFFER);); + + /* We allocate enough space for the objects that are likely to + be created later in order to minimize the number of malloc() + calls */ + heap = mem_heap_create(PREBUILT_HEAP_INITIAL_SIZE + 2 * srch_key_len); + + prebuilt = static_cast<row_prebuilt_t*>( + mem_heap_zalloc(heap, sizeof(*prebuilt))); + + prebuilt->magic_n = ROW_PREBUILT_ALLOCATED; + prebuilt->magic_n2 = ROW_PREBUILT_ALLOCATED; + + prebuilt->table = table; + + prebuilt->sql_stat_start = TRUE; + prebuilt->heap = heap; + + prebuilt->srch_key_val_len = srch_key_len; + if (prebuilt->srch_key_val_len) { + prebuilt->srch_key_val1 = static_cast<byte*>( + mem_heap_alloc(prebuilt->heap, + 2 * prebuilt->srch_key_val_len)); + prebuilt->srch_key_val2 = prebuilt->srch_key_val1 + + prebuilt->srch_key_val_len; + } else { + prebuilt->srch_key_val1 = NULL; + prebuilt->srch_key_val2 = NULL; + } + + prebuilt->pcur = static_cast<btr_pcur_t*>( + mem_heap_zalloc(prebuilt->heap, + sizeof(btr_pcur_t))); + prebuilt->clust_pcur = static_cast<btr_pcur_t*>( + mem_heap_zalloc(prebuilt->heap, + sizeof(btr_pcur_t))); + btr_pcur_reset(prebuilt->pcur); + btr_pcur_reset(prebuilt->clust_pcur); + + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->stored_select_lock_type = LOCK_NONE_UNSET; + + prebuilt->search_tuple = dtuple_create(heap, search_tuple_n_fields); + + ref = dtuple_create(heap, ref_len); + + dict_index_copy_types(ref, clust_index, ref_len); + + prebuilt->clust_ref = ref; + + prebuilt->autoinc_error = DB_SUCCESS; + prebuilt->autoinc_offset = 0; + + /* Default to 1, we will set the actual value later in + ha_innobase::get_auto_increment(). */ + prebuilt->autoinc_increment = 1; + + prebuilt->autoinc_last_value = 0; + + /* During UPDATE and DELETE we need the doc id. */ + prebuilt->fts_doc_id = 0; + + prebuilt->mysql_row_len = mysql_row_len; + + prebuilt->fts_doc_id_in_read_set = 0; + prebuilt->blob_heap = NULL; + + DBUG_RETURN(prebuilt); +} + +/** Free a prebuilt struct for a TABLE handle. 
*/ +void row_prebuilt_free(row_prebuilt_t *prebuilt) +{ + DBUG_ENTER("row_prebuilt_free"); + + ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED); + ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED); + + prebuilt->magic_n = ROW_PREBUILT_FREED; + prebuilt->magic_n2 = ROW_PREBUILT_FREED; + + btr_pcur_reset(prebuilt->pcur); + btr_pcur_reset(prebuilt->clust_pcur); + + ut_free(prebuilt->mysql_template); + + if (prebuilt->ins_graph) { + que_graph_free_recursive(prebuilt->ins_graph); + } + + if (prebuilt->sel_graph) { + que_graph_free_recursive(prebuilt->sel_graph); + } + + if (prebuilt->upd_graph) { + que_graph_free_recursive(prebuilt->upd_graph); + } + + if (prebuilt->blob_heap) { + row_mysql_prebuilt_free_blob_heap(prebuilt); + } + + if (prebuilt->old_vers_heap) { + mem_heap_free(prebuilt->old_vers_heap); + } + + if (prebuilt->fetch_cache[0] != NULL) { + byte* base = prebuilt->fetch_cache[0] - 4; + byte* ptr = base; + + for (ulint i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) { + ulint magic1 = mach_read_from_4(ptr); + ut_a(magic1 == ROW_PREBUILT_FETCH_MAGIC_N); + ptr += 4; + + byte* row = ptr; + ut_a(row == prebuilt->fetch_cache[i]); + ptr += prebuilt->mysql_row_len; + + ulint magic2 = mach_read_from_4(ptr); + ut_a(magic2 == ROW_PREBUILT_FETCH_MAGIC_N); + ptr += 4; + } + + ut_free(base); + } + + if (prebuilt->rtr_info) { + rtr_clean_rtr_info(prebuilt->rtr_info, true); + } + if (prebuilt->table) { + dict_table_close(prebuilt->table); + } + + mem_heap_free(prebuilt->heap); + + DBUG_VOID_RETURN; +} + +/*********************************************************************//** +Updates the transaction pointers in query graphs stored in the prebuilt +struct. */ +void +row_update_prebuilt_trx( +/*====================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct + in MySQL handle */ + trx_t* trx) /*!< in: transaction handle */ +{ + ut_a(trx->magic_n == TRX_MAGIC_N); + ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED); + ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED); + + prebuilt->trx = trx; + + if (prebuilt->ins_graph) { + prebuilt->ins_graph->trx = trx; + } + + if (prebuilt->upd_graph) { + prebuilt->upd_graph->trx = trx; + } + + if (prebuilt->sel_graph) { + prebuilt->sel_graph->trx = trx; + } +} + +/*********************************************************************//** +Gets pointer to a prebuilt dtuple used in insertions. If the insert graph +has not yet been built in the prebuilt struct, then this function first +builds it. +@return prebuilt dtuple; the column type information is also set in it */ +static +dtuple_t* +row_get_prebuilt_insert_row( +/*========================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ +{ + dict_table_t* table = prebuilt->table; + + ut_ad(prebuilt && table && prebuilt->trx); + + if (prebuilt->ins_node != 0) { + + /* Check if indexes have been dropped or added and we + may need to rebuild the row insert template. 
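+(Illustrative note, not part of the original comment: the check below compares prebuilt->trx_id with table->def_trx_id and the cached entry_list length with the current number of indexes; any DDL advances def_trx_id, so the insert node and query graph are discarded and rebuilt.)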
*/ + + if (prebuilt->trx_id == table->def_trx_id + && prebuilt->ins_node->entry_list.size() + == UT_LIST_GET_LEN(table->indexes)) { + return(prebuilt->ins_node->row); + } + + ut_ad(prebuilt->trx_id < table->def_trx_id); + + que_graph_free_recursive(prebuilt->ins_graph); + + prebuilt->ins_graph = 0; + } + + /* Create an insert node and query graph to the prebuilt struct */ + + ins_node_t* node; + + node = ins_node_create(INS_DIRECT, table, prebuilt->heap); + + prebuilt->ins_node = node; + + if (prebuilt->ins_upd_rec_buff == 0) { + prebuilt->ins_upd_rec_buff = static_cast<byte*>( + mem_heap_alloc( + prebuilt->heap, + DICT_TF2_FLAG_IS_SET(prebuilt->table, + DICT_TF2_FTS_HAS_DOC_ID) + ? prebuilt->mysql_row_len + 8/* FTS_DOC_ID */ + : prebuilt->mysql_row_len)); + } + + dtuple_t* row; + + row = dtuple_create_with_vcol( + prebuilt->heap, dict_table_get_n_cols(table), + dict_table_get_n_v_cols(table)); + + dict_table_copy_types(row, table); + + ins_node_set_new_row(node, row); + que_thr_t* fork = pars_complete_graph_for_exec( + node, prebuilt->trx, prebuilt->heap, prebuilt); + fork->state = QUE_THR_RUNNING; + + prebuilt->ins_graph = static_cast<que_fork_t*>( + que_node_get_parent(fork)); + + prebuilt->ins_graph->state = QUE_FORK_ACTIVE; + + prebuilt->trx_id = table->def_trx_id; + + return(prebuilt->ins_node->row); +} + +/*********************************************************************//** +Sets an AUTO_INC type lock on the table mentioned in prebuilt. The +AUTO_INC lock gives exclusive access to the auto-inc counter of the +table. The lock is reserved only for the duration of an SQL statement. +It is not compatible with another AUTO_INC or exclusive lock on the +table. +@return error code or DB_SUCCESS */ +dberr_t +row_lock_table_autoinc_for_mysql( +/*=============================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL + table handle */ +{ + trx_t* trx = prebuilt->trx; + ins_node_t* node = prebuilt->ins_node; + const dict_table_t* table = prebuilt->table; + que_thr_t* thr; + dberr_t err; + + /* If we already hold an AUTOINC lock on the table then do nothing. + Note: We peek at the value of the current owner without acquiring + lock_sys.latch. */ + if (trx == table->autoinc_trx) { + + return(DB_SUCCESS); + } + + trx->op_info = "setting auto-inc lock"; + + row_get_prebuilt_insert_row(prebuilt); + node = prebuilt->ins_node; + + /* We use the insert query graph as the dummy graph needed + in the lock module call */ + + thr = que_fork_get_first_thr(prebuilt->ins_graph); + + do { + thr->run_node = node; + thr->prev_node = node; + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + trx_start_if_not_started_xa(trx, true); + + err = lock_table(prebuilt->table, NULL, LOCK_AUTO_INC, thr); + + trx->error_state = err; + } while (err != DB_SUCCESS + && row_mysql_handle_errors(&err, trx, thr, NULL)); + + trx->op_info = ""; + + return(err); +} + +/** Lock a table. 
+@param[in,out] prebuilt table handle +@return error code or DB_SUCCESS */ +dberr_t +row_lock_table(row_prebuilt_t* prebuilt) +{ + trx_t* trx = prebuilt->trx; + que_thr_t* thr; + dberr_t err; + + trx->op_info = "setting table lock"; + + if (prebuilt->sel_graph == NULL) { + /* Build a dummy select query graph */ + row_prebuild_sel_graph(prebuilt); + } + + /* We use the select query graph as the dummy graph needed + in the lock module call */ + + thr = que_fork_get_first_thr(prebuilt->sel_graph); + + do { + thr->run_node = thr; + thr->prev_node = thr->common.parent; + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + trx_start_if_not_started_xa(trx, false); + + err = lock_table(prebuilt->table, NULL, static_cast<lock_mode>( + prebuilt->select_lock_type), thr); + trx->error_state = err; + } while (err != DB_SUCCESS + && row_mysql_handle_errors(&err, trx, thr, NULL)); + + trx->op_info = ""; + + return(err); +} + +/** Determine is tablespace encrypted but decryption failed, is table corrupted +or is tablespace .ibd file missing. +@param[in] table Table +@param[in] trx Transaction +@param[in] push_warning true if we should push warning to user +@retval DB_DECRYPTION_FAILED table is encrypted but decryption failed +@retval DB_CORRUPTION table is corrupted +@retval DB_TABLESPACE_NOT_FOUND tablespace .ibd file not found */ +static +dberr_t +row_mysql_get_table_status( + const dict_table_t* table, + trx_t* trx, + bool push_warning = true) +{ + dberr_t err; + if (const fil_space_t* space = table->space) { + if (space->crypt_data && space->crypt_data->is_encrypted()) { + // maybe we cannot access the table due to failing + // to decrypt + if (push_warning) { + ib_push_warning(trx, DB_DECRYPTION_FAILED, + "Table %s is encrypted." + "However key management plugin or used key_id is not found or" + " used encryption algorithm or method does not match.", + table->name.m_name); + } + + err = DB_DECRYPTION_FAILED; + } else { + if (push_warning) { + ib_push_warning(trx, DB_CORRUPTION, + "Table %s in tablespace %lu corrupted.", + table->name.m_name, table->space); + } + + err = DB_CORRUPTION; + } + } else { + ib::error() << ".ibd file is missing for table " + << table->name; + err = DB_TABLESPACE_NOT_FOUND; + } + + return(err); +} + +/** Does an insert for MySQL. 
+@param[in] mysql_rec row in the MySQL format +@param[in,out] prebuilt prebuilt struct in MySQL handle +@return error code or DB_SUCCESS */ +dberr_t +row_insert_for_mysql( + const byte* mysql_rec, + row_prebuilt_t* prebuilt, + ins_mode_t ins_mode) +{ + trx_savept_t savept; + que_thr_t* thr; + dberr_t err; + ibool was_lock_wait; + trx_t* trx = prebuilt->trx; + ins_node_t* node = prebuilt->ins_node; + dict_table_t* table = prebuilt->table; + + /* FIX_ME: This blob heap is used to compensate an issue in server + for virtual column blob handling */ + mem_heap_t* blob_heap = NULL; + + ut_ad(trx); + ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED); + ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED); + + if (!table->space) { + ib::error() << "The table " << table->name + << " doesn't have a corresponding tablespace, it was" + " discarded."; + + return(DB_TABLESPACE_DELETED); + } else if (!table->is_readable()) { + return row_mysql_get_table_status(table, trx, true); + } else if (high_level_read_only) { + return(DB_READ_ONLY); + } else if (UNIV_UNLIKELY(table->corrupted) + || dict_table_get_first_index(table)->is_corrupted()) { + return DB_TABLE_CORRUPT; + } + + trx->op_info = "inserting"; + + row_mysql_delay_if_needed(); + + if (!table->no_rollback()) { + trx_start_if_not_started_xa(trx, true); + } + + row_get_prebuilt_insert_row(prebuilt); + node = prebuilt->ins_node; + + row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec, + &blob_heap); + + if (ins_mode != ROW_INS_NORMAL) { + node->vers_update_end(prebuilt, ins_mode == ROW_INS_HISTORICAL); + } + + /* Because we now allow multiple INSERT into the same + initially empty table in bulk insert mode, on error we must + roll back to the start of the transaction. For correctness, it + would suffice to roll back to the start of the first insert + into this empty table, but we will keep it simple and efficient. */ + savept.least_undo_no = trx->bulk_insert ? 0 : trx->undo_no; + + thr = que_fork_get_first_thr(prebuilt->ins_graph); + + if (prebuilt->sql_stat_start) { + node->state = INS_NODE_SET_IX_LOCK; + prebuilt->sql_stat_start = FALSE; + } else { + node->state = INS_NODE_ALLOC_ROW_ID; + node->trx_id = trx->id; + } + +run_again: + thr->run_node = node; + thr->prev_node = node; + + row_ins_step(thr); + + DEBUG_SYNC_C("ib_after_row_insert_step"); + + err = trx->error_state; + + if (err != DB_SUCCESS) { +error_exit: + /* FIXME: What's this ? 
*/ + thr->lock_state = QUE_THR_LOCK_ROW; + + was_lock_wait = row_mysql_handle_errors( + &err, trx, thr, &savept); + + thr->lock_state = QUE_THR_LOCK_NOLOCK; + + if (was_lock_wait) { + ut_ad(node->state == INS_NODE_INSERT_ENTRIES + || node->state == INS_NODE_ALLOC_ROW_ID + || node->state == INS_NODE_SET_IX_LOCK); + goto run_again; + } + + trx->op_info = ""; + + if (blob_heap != NULL) { + mem_heap_free(blob_heap); + } + + return(err); + } + + if (dict_table_has_fts_index(table) + && (!table->versioned() + || !node->row->fields[table->vers_end].vers_history_row())) { + + doc_id_t doc_id; + + /* Extract the doc id from the hidden FTS column */ + doc_id = fts_get_doc_id_from_row(table, node->row); + + if (doc_id <= 0) { + ib::error() << "FTS_DOC_ID must be larger than 0 for table " + << table->name; + err = DB_FTS_INVALID_DOCID; + trx->error_state = DB_FTS_INVALID_DOCID; + goto error_exit; + } + + if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { + doc_id_t next_doc_id + = table->fts->cache->next_doc_id; + + if (doc_id < next_doc_id) { + ib::error() << "FTS_DOC_ID must be larger than " + << next_doc_id - 1 << " for table " + << table->name; + + err = DB_FTS_INVALID_DOCID; + trx->error_state = DB_FTS_INVALID_DOCID; + goto error_exit; + } + } + + if (table->skip_alter_undo) { + if (trx->fts_trx == NULL) { + trx->fts_trx = fts_trx_create(trx); + } + + fts_trx_table_t ftt; + ftt.table = table; + ftt.fts_trx = trx->fts_trx; + + fts_add_doc_from_tuple(&ftt, doc_id, node->row); + } else { + /* Pass NULL for the columns affected, since an INSERT affects + all FTS indexes. */ + fts_trx_add_op(trx, table, doc_id, FTS_INSERT, NULL); + } + } + + /* Not protected by dict_sys.latch or table->stats_mutex_lock() + for performance + reasons, we would rather get garbage in stat_n_rows (which is + just an estimate anyway) than protecting the following code + with a latch. */ + dict_table_n_rows_inc(table); + + if (prebuilt->clust_index_was_generated) { + /* set row id to prebuilt */ + memcpy(prebuilt->row_id, node->sys_buf, DATA_ROW_ID_LEN); + } + + dict_stats_update_if_needed(table, *trx); + trx->op_info = ""; + + if (blob_heap != NULL) { + mem_heap_free(blob_heap); + } + + return(err); +} + +/*********************************************************************//** +Builds a dummy query graph used in selects. */ +void +row_prebuild_sel_graph( +/*===================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ +{ + sel_node_t* node; + + ut_ad(prebuilt && prebuilt->trx); + + if (prebuilt->sel_graph == NULL) { + + node = sel_node_create(prebuilt->heap); + + que_thr_t* fork = pars_complete_graph_for_exec( + node, prebuilt->trx, prebuilt->heap, prebuilt); + fork->state = QUE_THR_RUNNING; + + prebuilt->sel_graph = static_cast<que_fork_t*>( + que_node_get_parent(fork)); + + prebuilt->sel_graph->state = QUE_FORK_ACTIVE; + } +} + +/*********************************************************************//** +Creates an query graph node of 'update' type to be used in the MySQL +interface. 
+@return own: update node */ +upd_node_t* +row_create_update_node_for_mysql( +/*=============================*/ + dict_table_t* table, /*!< in: table to update */ + mem_heap_t* heap) /*!< in: mem heap from which allocated */ +{ + upd_node_t* node; + + DBUG_ENTER("row_create_update_node_for_mysql"); + + node = upd_node_create(heap); + + node->in_mysql_interface = true; + node->is_delete = NO_DELETE; + node->pcur = new (mem_heap_alloc(heap, sizeof(btr_pcur_t))) + btr_pcur_t(); + + node->table = table; + + node->update = upd_create(dict_table_get_n_cols(table) + + dict_table_get_n_v_cols(table), heap); + + node->update_n_fields = dict_table_get_n_cols(table); + + UT_LIST_INIT(node->columns, &sym_node_t::col_var_list); + + node->has_clust_rec_x_lock = TRUE; + + DBUG_RETURN(node); +} + +/*********************************************************************//** +Gets pointer to a prebuilt update vector used in updates. If the update +graph has not yet been built in the prebuilt struct, then this function +first builds it. +@return prebuilt update vector */ +upd_t* +row_get_prebuilt_update_vector( +/*===========================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ +{ + if (prebuilt->upd_node == NULL) { + + /* Not called before for this handle: create an update node + and query graph to the prebuilt struct */ + + prebuilt->upd_node = row_create_update_node_for_mysql( + prebuilt->table, prebuilt->heap); + + prebuilt->upd_graph = static_cast<que_fork_t*>( + que_node_get_parent( + pars_complete_graph_for_exec( + prebuilt->upd_node, + prebuilt->trx, prebuilt->heap, + prebuilt))); + + prebuilt->upd_graph->state = QUE_FORK_ACTIVE; + } + + return(prebuilt->upd_node->update); +} + +/******************************************************************** +Handle an update of a column that has an FTS index. */ +static +void +row_fts_do_update( +/*==============*/ + trx_t* trx, /* in: transaction */ + dict_table_t* table, /* in: Table with FTS index */ + doc_id_t old_doc_id, /* in: old document id */ + doc_id_t new_doc_id) /* in: new document id */ +{ + if(trx->fts_next_doc_id) { + fts_trx_add_op(trx, table, old_doc_id, FTS_DELETE, NULL); + if(new_doc_id != FTS_NULL_DOC_ID) + fts_trx_add_op(trx, table, new_doc_id, FTS_INSERT, NULL); + } +} + +/************************************************************************ +Handles FTS matters for an update or a delete. +NOTE: should not be called if the table does not have an FTS index. .*/ +static +dberr_t +row_fts_update_or_delete( +/*=====================*/ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + trx_t* trx = prebuilt->trx; + dict_table_t* table = prebuilt->table; + upd_node_t* node = prebuilt->upd_node; + doc_id_t old_doc_id = prebuilt->fts_doc_id; + + DBUG_ENTER("row_fts_update_or_delete"); + + ut_a(dict_table_has_fts_index(prebuilt->table)); + + /* Deletes are simple; get them out of the way first. 
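+	A delete only needs an FTS_DELETE operation for the old document id,
+	while an update must also register an FTS_INSERT for the new one.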
*/ + if (node->is_delete) { + /* A delete affects all FTS indexes, so we pass NULL */ + fts_trx_add_op(trx, table, old_doc_id, FTS_DELETE, NULL); + } else { + doc_id_t new_doc_id; + new_doc_id = fts_read_doc_id((byte*) &trx->fts_next_doc_id); + + if (new_doc_id == 0) { + ib::error() << "InnoDB FTS: Doc ID cannot be 0"; + DBUG_RETURN(DB_FTS_INVALID_DOCID); + } + row_fts_do_update(trx, table, old_doc_id, new_doc_id); + } + + DBUG_RETURN(DB_SUCCESS); +} + +/*********************************************************************//** +Initialize the Doc ID system for FK table with FTS index */ +static +void +init_fts_doc_id_for_ref( +/*====================*/ + dict_table_t* table, /*!< in: table */ + ulint* depth) /*!< in: recusive call depth */ +{ + table->fk_max_recusive_level = 0; + + /* Limit on tables involved in cascading delete/update */ + if (++*depth > FK_MAX_CASCADE_DEL) { + return; + } + + /* Loop through this table's referenced list and also + recursively traverse each table's foreign table list */ + for (dict_foreign_t* foreign : table->referenced_set) { + ut_ad(foreign->foreign_table); + + if (foreign->foreign_table->fts) { + fts_init_doc_id(foreign->foreign_table); + } + + if (foreign->foreign_table != table + && !foreign->foreign_table->referenced_set.empty()) { + init_fts_doc_id_for_ref( + foreign->foreign_table, depth); + } + } +} + +/** Does an update or delete of a row for MySQL. +@param[in,out] prebuilt prebuilt struct in MySQL handle +@return error code or DB_SUCCESS */ +dberr_t +row_update_for_mysql(row_prebuilt_t* prebuilt) +{ + trx_savept_t savept; + dberr_t err; + que_thr_t* thr; + dict_index_t* clust_index; + upd_node_t* node; + dict_table_t* table = prebuilt->table; + trx_t* trx = prebuilt->trx; + ulint fk_depth = 0; + + DBUG_ENTER("row_update_for_mysql"); + + ut_ad(trx); + ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED); + ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED); + ut_a(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW); + ut_ad(table->stat_initialized); + + if (!table->is_readable()) { + return(row_mysql_get_table_status(table, trx, true)); + } + + if (high_level_read_only) { + return(DB_READ_ONLY); + } + + DEBUG_SYNC_C("innodb_row_update_for_mysql_begin"); + + trx->op_info = "updating or deleting"; + + row_mysql_delay_if_needed(); + + init_fts_doc_id_for_ref(table, &fk_depth); + + if (!table->no_rollback()) { + trx_start_if_not_started_xa(trx, true); + } + + node = prebuilt->upd_node; + const bool is_delete = node->is_delete == PLAIN_DELETE; + ut_ad(node->table == table); + + clust_index = dict_table_get_first_index(table); + + btr_pcur_copy_stored_position(node->pcur, + prebuilt->pcur->index() == clust_index + ? 
prebuilt->pcur + : prebuilt->clust_pcur); + + ut_a(node->pcur->rel_pos == BTR_PCUR_ON); + + /* MySQL seems to call rnd_pos before updating each row it + has cached: we can get the correct cursor position from + prebuilt->pcur; NOTE that we cannot build the row reference + from mysql_rec if the clustered index was automatically + generated for the table: MySQL does not know anything about + the row id used as the clustered index key */ + + savept.least_undo_no = trx->undo_no; + + thr = que_fork_get_first_thr(prebuilt->upd_graph); + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + ut_ad(!prebuilt->sql_stat_start); + + ut_ad(!prebuilt->versioned_write || node->table->versioned()); + + if (prebuilt->versioned_write && node->is_delete == VERSIONED_DELETE) { + node->vers_make_delete(trx); + } + + for (;;) { + thr->run_node = node; + thr->prev_node = node; + thr->fk_cascade_depth = 0; + + row_upd_step(thr); + + err = trx->error_state; + + if (err == DB_SUCCESS) { + break; + } + + if (err == DB_RECORD_NOT_FOUND) { + trx->error_state = DB_SUCCESS; + goto error; + } + + thr->lock_state= QUE_THR_LOCK_ROW; + + DEBUG_SYNC(trx->mysql_thd, "row_update_for_mysql_error"); + + bool was_lock_wait = row_mysql_handle_errors( + &err, trx, thr, &savept); + thr->lock_state= QUE_THR_LOCK_NOLOCK; + + if (!was_lock_wait) { + goto error; + } + } + + if (dict_table_has_fts_index(table) + && trx->fts_next_doc_id != UINT64_UNDEFINED) { + err = row_fts_update_or_delete(prebuilt); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + ut_ad("unexpected error" == 0); + goto error; + } + } + + /* Completed cascading operations (if any) */ + bool update_statistics; + ut_ad(is_delete == (node->is_delete == PLAIN_DELETE)); + + if (is_delete) { + /* Not protected by dict_sys.latch + or prebuilt->table->stats_mutex_lock() for performance + reasons, we would rather get garbage in stat_n_rows (which is + just an estimate anyway) than protecting the following code + with a latch. */ + dict_table_n_rows_dec(prebuilt->table); + + update_statistics = !srv_stats_include_delete_marked; + } else { + update_statistics + = !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE); + } + + if (update_statistics) { + dict_stats_update_if_needed(prebuilt->table, *trx); + } else { + /* Always update the table modification counter. */ + prebuilt->table->stat_modified_counter++; + } + +error: + trx->op_info = ""; + DBUG_RETURN(err); +} + +/** This can only be used when the current transaction is at +READ COMMITTED or READ UNCOMMITTED isolation level. +Before calling this function row_search_mvcc() must have +initialized prebuilt->new_rec_locks to store the information which new +record locks really were set. This function removes a newly set +clustered index record lock under prebuilt->pcur or +prebuilt->clust_pcur. Thus, this implements a 'mini-rollback' that +releases the latest clustered index record lock we set. +@param[in,out] prebuilt prebuilt struct in MySQL handle +@param[in] has_latches_on_recs TRUE if called so that we have the + latches on the records under pcur + and clust_pcur, and we do not need + to reposition the cursors. 
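+				Note: if the clustered index record was
+				modified by this transaction itself, its
+				lock is kept.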
*/ +void +row_unlock_for_mysql( + row_prebuilt_t* prebuilt, + ibool has_latches_on_recs) +{ + if (prebuilt->new_rec_locks == 1 && prebuilt->index->is_clust()) { + trx_t* trx = prebuilt->trx; + ut_ad(trx->isolation_level <= TRX_ISO_READ_COMMITTED); + trx->op_info = "unlock_row"; + + const rec_t* rec; + dict_index_t* index; + trx_id_t rec_trx_id; + mtr_t mtr; + btr_pcur_t* pcur = prebuilt->pcur; + + mtr_start(&mtr); + + /* Restore the cursor position and find the record */ + + if (!has_latches_on_recs + && pcur->restore_position(BTR_SEARCH_LEAF, &mtr) + != btr_pcur_t::SAME_ALL) { + goto no_unlock; + } + + rec = btr_pcur_get_rec(pcur); + index = pcur->index(); + + /* If the record has been modified by this + transaction, do not unlock it. */ + + if (index->trx_id_offset) { + rec_trx_id = trx_read_trx_id(rec + + index->trx_id_offset); + } else { + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + + rec_offs_init(offsets_); + offsets = rec_get_offsets(rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + + rec_trx_id = row_get_rec_trx_id(rec, index, offsets); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + if (rec_trx_id != trx->id) { + /* We did not update the record: unlock it */ + + rec = btr_pcur_get_rec(pcur); + + lock_rec_unlock( + trx, + btr_pcur_get_block(pcur)->page.id(), + rec, + static_cast<enum lock_mode>( + prebuilt->select_lock_type)); + } +no_unlock: + mtr_commit(&mtr); + trx->op_info = ""; + } +} + +/** Write query start time as SQL field data to a buffer. Needed by InnoDB. +@param thd Thread object +@param buf Buffer to hold start time data */ +void thd_get_query_start_data(THD *thd, char *buf); + +/** Insert history row when evaluating foreign key referential action. + +1. Create new dtuple_t 'row' from node->historical_row; +2. Update its row_end to current timestamp; +3. Insert it to a table; +4. Update table statistics. + +This is used in UPDATE CASCADE/SET NULL of a system versioned referenced table. + +node->historical_row: dtuple_t containing pointers of row changed by refertial +action. 
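+node->historical_heap: memory heap used for building the history row; it is
+freed before this function returns.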
+ +@param[in] thr current query thread +@param[in] node a node which just updated a row in a foreign table +@return DB_SUCCESS or some error */ +static dberr_t row_update_vers_insert(que_thr_t* thr, upd_node_t* node) +{ + trx_t* trx = thr_get_trx(thr); + dfield_t* row_end; + char row_end_data[8]; + dict_table_t* table = node->table; + const unsigned zip_size = table->space->zip_size(); + ut_ad(table->versioned()); + + dtuple_t* row; + const ulint n_cols = dict_table_get_n_cols(table); + const ulint n_v_cols = dict_table_get_n_v_cols(table); + + ut_ad(n_cols == dtuple_get_n_fields(node->historical_row)); + ut_ad(n_v_cols == dtuple_get_n_v_fields(node->historical_row)); + + row = dtuple_create_with_vcol(node->historical_heap, n_cols, n_v_cols); + + dict_table_copy_types(row, table); + + ins_node_t* insert_node = + ins_node_create(INS_DIRECT, table, node->historical_heap); + + if (!insert_node) { + trx->error_state = DB_OUT_OF_MEMORY; + goto exit; + } + + insert_node->common.parent = thr; + ins_node_set_new_row(insert_node, row); + + ut_ad(n_cols > DATA_N_SYS_COLS); + // Exclude DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR + for (ulint i = 0; i < n_cols - DATA_N_SYS_COLS; i++) { + dfield_t *src= dtuple_get_nth_field(node->historical_row, i); + dfield_t *dst= dtuple_get_nth_field(row, i); + dfield_copy(dst, src); + if (dfield_is_ext(src)) { + byte *field_data + = static_cast<byte*>(dfield_get_data(src)); + ulint ext_len; + ulint field_len = dfield_get_len(src); + + ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE); + + ut_a(memcmp(field_data + field_len + - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + + byte *data = btr_copy_externally_stored_field( + &ext_len, field_data, zip_size, field_len, + node->historical_heap); + dfield_set_data(dst, data, ext_len); + } + } + + for (ulint i = 0; i < n_v_cols; i++) { + dfield_t *dst= dtuple_get_nth_v_field(row, i); + dfield_t *src= dtuple_get_nth_v_field(node->historical_row, i); + dfield_copy(dst, src); + } + + node->historical_row = NULL; + + row_end = dtuple_get_nth_field(row, table->vers_end); + if (dict_table_get_nth_col(table, table->vers_end)->vers_native()) { + mach_write_to_8(row_end_data, trx->id); + dfield_set_data(row_end, row_end_data, 8); + } else { + thd_get_query_start_data(trx->mysql_thd, row_end_data); + dfield_set_data(row_end, row_end_data, 7); + } + + for (;;) { + thr->run_node = insert_node; + thr->prev_node = insert_node; + + row_ins_step(thr); + + switch (trx->error_state) { + case DB_LOCK_WAIT: + if (lock_wait(thr) == DB_SUCCESS) { + continue; + } + + /* fall through */ + default: + /* Other errors are handled for the parent node. */ + thr->fk_cascade_depth = 0; + goto exit; + + case DB_SUCCESS: + dict_stats_update_if_needed(table, *trx); + goto exit; + } + } +exit: + que_graph_free_recursive(insert_node); + mem_heap_free(node->historical_heap); + node->historical_heap = NULL; + return trx->error_state; +} + +/**********************************************************************//** +Does a cascaded delete or set null in a foreign key operation. 
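+The given query thread is reused for the cascaded operation;
+fk_cascade_depth limits the recursion to FK_MAX_CASCADE_DEL levels, beyond
+which DB_FOREIGN_EXCEED_MAX_CASCADE is returned.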
+@return error code or DB_SUCCESS */ +dberr_t +row_update_cascade_for_mysql( +/*=========================*/ + que_thr_t* thr, /*!< in: query thread */ + upd_node_t* node, /*!< in: update node used in the cascade + or set null operation */ + dict_table_t* table) /*!< in: table where we do the operation */ +{ + /* Increment fk_cascade_depth to record the recursive call depth on + a single update/delete that affects multiple tables chained + together with foreign key relations. */ + + if (++thr->fk_cascade_depth > FK_MAX_CASCADE_DEL) { + return(DB_FOREIGN_EXCEED_MAX_CASCADE); + } + + trx_t* trx = thr_get_trx(thr); + + if (table->versioned()) { + if (node->is_delete == PLAIN_DELETE) { + node->vers_make_delete(trx); + } else if (node->update->affects_versioned()) { + dberr_t err = row_update_vers_insert(thr, node); + if (err != DB_SUCCESS) { + return err; + } + node->vers_make_update(trx); + } + } + + for (;;) { + thr->run_node = node; + thr->prev_node = node; + + DEBUG_SYNC_C("foreign_constraint_update_cascade"); + { + TABLE *mysql_table = thr->prebuilt->m_mysql_table; + thr->prebuilt->m_mysql_table = NULL; + row_upd_step(thr); + thr->prebuilt->m_mysql_table = mysql_table; + } + + switch (trx->error_state) { + case DB_LOCK_WAIT: + if (lock_wait(thr) == DB_SUCCESS) { + continue; + } + + /* fall through */ + default: + /* Other errors are handled for the parent node. */ + thr->fk_cascade_depth = 0; + return trx->error_state; + + case DB_SUCCESS: + thr->fk_cascade_depth = 0; + bool stats; + + if (node->is_delete == PLAIN_DELETE) { + /* Not protected by dict_sys.latch + or node->table->stats_mutex_lock() for + performance reasons, we would rather + get garbage in stat_n_rows (which is + just an estimate anyway) than + protecting the following code with a + latch. */ + dict_table_n_rows_dec(node->table); + + stats = !srv_stats_include_delete_marked; + } else { + stats = !(node->cmpl_info + & UPD_NODE_NO_ORD_CHANGE); + } + + if (stats) { + dict_stats_update_if_needed(node->table, *trx); + } else { + /* Always update the table + modification counter. */ + node->table->stat_modified_counter++; + } + + return(DB_SUCCESS); + } + } +} + +/*********************************************************************//** +Creates a table for MySQL. On failure the transaction will be rolled back +and the 'table' object will be freed. 
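+The caller must be holding the data dictionary latch with an active
+transaction.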
+@return error code or DB_SUCCESS */ +dberr_t +row_create_table_for_mysql( +/*=======================*/ + dict_table_t* table, /*!< in, own: table definition + (will be freed, or on DB_SUCCESS + added to the data dictionary cache) */ + trx_t* trx) /*!< in/out: transaction */ +{ + tab_node_t* node; + mem_heap_t* heap; + que_thr_t* thr; + + ut_ad(trx->state == TRX_STATE_ACTIVE); + ut_ad(dict_sys.sys_tables_exist()); + ut_ad(dict_sys.locked()); + ut_ad(trx->dict_operation_lock_mode); + + DEBUG_SYNC_C("create_table"); + + DBUG_EXECUTE_IF( + "ib_create_table_fail_at_start_of_row_create_table_for_mysql", + dict_mem_table_free(table); return DB_ERROR; + ); + + trx->op_info = "creating table"; + + heap = mem_heap_create(512); + + trx->dict_operation = true; + + node = tab_create_graph_create(table, heap); + + thr = pars_complete_graph_for_exec(node, trx, heap, NULL); + + ut_a(thr == que_fork_start_command( + static_cast<que_fork_t*>(que_node_get_parent(thr)))); + + que_run_threads(thr); + + dberr_t err = trx->error_state; + + if (err != DB_SUCCESS) { + trx->error_state = DB_SUCCESS; + trx->rollback(); + dict_mem_table_free(table); + } + + que_graph_free((que_t*) que_node_get_parent(thr)); + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Create an index when creating a table. +On failure, the caller must drop the table! +@return error number or DB_SUCCESS */ +dberr_t +row_create_index_for_mysql( +/*=======================*/ + dict_index_t* index, /*!< in, own: index definition + (will be freed) */ + trx_t* trx, /*!< in: transaction handle */ + const ulint* field_lengths, /*!< in: if not NULL, must contain + dict_index_get_n_fields(index) + actual field lengths for the + index columns, which are + then checked for not being too + large. */ + fil_encryption_t mode, /*!< in: encryption mode */ + uint32_t key_id) /*!< in: encryption key_id */ +{ + ind_node_t* node; + mem_heap_t* heap; + que_thr_t* thr; + dberr_t err; + ulint i; + ulint len; + dict_table_t* table = index->table; + + ut_ad(dict_sys.locked()); + + for (i = 0; i < index->n_def; i++) { + /* Check that prefix_len and actual length + < DICT_MAX_INDEX_COL_LEN */ + + len = dict_index_get_nth_field(index, i)->prefix_len; + + if (field_lengths && field_lengths[i]) { + len = ut_max(len, field_lengths[i]); + } + + DBUG_EXECUTE_IF( + "ib_create_table_fail_at_create_index", + len = DICT_MAX_FIELD_LEN_BY_FORMAT(table) + 1; + ); + + /* Column or prefix length exceeds maximum column length */ + if (len > (ulint) DICT_MAX_FIELD_LEN_BY_FORMAT(table)) { + dict_mem_index_free(index); + return DB_TOO_BIG_INDEX_COL; + } + } + + /* For temp-table we avoid insertion into SYSTEM TABLES to + maintain performance and so we have separate path that directly + just updates dictonary cache. */ + if (!table->is_temporary()) { + ut_ad(trx->state == TRX_STATE_ACTIVE); + ut_ad(trx->dict_operation); + trx->op_info = "creating index"; + + /* Note that the space id where we store the index is + inherited from the table in dict_build_index_def_step() + in dict0crea.cc. 
*/ + + heap = mem_heap_create(512); + node = ind_create_graph_create(index, table->name.m_name, + heap, mode, key_id); + + thr = pars_complete_graph_for_exec(node, trx, heap, NULL); + + ut_a(thr == que_fork_start_command( + static_cast<que_fork_t*>( + que_node_get_parent(thr)))); + + que_run_threads(thr); + + err = trx->error_state; + + index = node->index; + + ut_ad(!index == (err != DB_SUCCESS)); + + que_graph_free((que_t*) que_node_get_parent(thr)); + + if (index && (index->type & DICT_FTS)) { + err = fts_create_index_tables(trx, index, table->id); + } + + trx->op_info = ""; + } else { + dict_build_index_def(table, index, trx); + + err = dict_index_add_to_cache(index, FIL_NULL); + ut_ad((index == NULL) == (err != DB_SUCCESS)); + if (UNIV_LIKELY(err == DB_SUCCESS)) { + ut_ad(!index->is_instant()); + index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(index->n_nullable))); + + err = dict_create_index_tree_in_mem(index, trx); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!index->search_info->ref_count); +#endif /* BTR_CUR_HASH_ADAPT */ + + if (err != DB_SUCCESS) { + dict_index_remove_from_cache(table, index); + } + } + } + + return(err); +} + +/** Reassigns the table identifier of a table. +@param[in,out] table table +@param[in,out] trx transaction +@param[out] new_id new table id +@return error code or DB_SUCCESS */ +static +dberr_t +row_mysql_table_id_reassign( + dict_table_t* table, + trx_t* trx, + table_id_t* new_id) +{ + if (!dict_sys.sys_tables || dict_sys.sys_tables->corrupted || + !dict_sys.sys_columns || dict_sys.sys_columns->corrupted || + !dict_sys.sys_indexes || dict_sys.sys_indexes->corrupted || + !dict_sys.sys_virtual || dict_sys.sys_virtual->corrupted) { + return DB_CORRUPTION; + } + + dberr_t err; + pars_info_t* info = pars_info_create(); + + dict_hdr_get_new_id(new_id, NULL, NULL); + + pars_info_add_ull_literal(info, "old_id", table->id); + pars_info_add_ull_literal(info, "new_id", *new_id); + + /* Note: This cannot be rolled back. Rollback would see the + UPDATE SYS_INDEXES as two operations: DELETE and INSERT. + It would invoke btr_free_if_exists() when rolling back the + INSERT, effectively dropping all indexes of the table. */ + err = que_eval_sql( + info, + "PROCEDURE RENUMBER_TABLE_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES SET ID = :new_id\n" + " WHERE ID = :old_id;\n" + "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = :old_id;\n" + "UPDATE SYS_INDEXES SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = :old_id;\n" + "UPDATE SYS_VIRTUAL SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = :old_id;\n" + "END;\n", trx); + + return(err); +} + +/*********************************************************************//** +Do the foreign key constraint checks. +@return DB_SUCCESS or error code. 
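+DB_CANNOT_DROP_CONSTRAINT if another table references the table to be
+discarded and foreign key checks are enabled.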
*/ +static +dberr_t +row_discard_tablespace_foreign_key_checks( +/*======================================*/ + const trx_t* trx, /*!< in: transaction handle */ + const dict_table_t* table) /*!< in: table to be discarded */ +{ + + if (srv_read_only_mode || !trx->check_foreigns) { + return(DB_SUCCESS); + } + + /* Check if the table is referenced by foreign key constraints from + some other table (not the table itself) */ + dict_foreign_set::const_iterator it + = std::find_if(table->referenced_set.begin(), + table->referenced_set.end(), + dict_foreign_different_tables()); + + if (it == table->referenced_set.end()) { + return(DB_SUCCESS); + } + + const dict_foreign_t* foreign = *it; + FILE* ef = dict_foreign_err_file; + + ut_ad(foreign->foreign_table != table); + ut_ad(foreign->referenced_table == table); + + /* We only allow discarding a referenced table if + FOREIGN_KEY_CHECKS is set to 0 */ + + mysql_mutex_lock(&dict_foreign_err_mutex); + + rewind(ef); + + ut_print_timestamp(ef); + + fputs(" Cannot DISCARD table ", ef); + ut_print_name(ef, trx, table->name.m_name); + fputs("\n" + "because it is referenced by ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + putc('\n', ef); + + mysql_mutex_unlock(&dict_foreign_err_mutex); + + return(DB_CANNOT_DROP_CONSTRAINT); +} + +/*********************************************************************//** +Do the DISCARD TABLESPACE operation. +@return DB_SUCCESS or error code. */ +static +dberr_t +row_discard_tablespace( +/*===================*/ + trx_t* trx, /*!< in/out: transaction handle */ + dict_table_t* table) /*!< in/out: table to be discarded */ +{ + dberr_t err; + + /* How do we prevent crashes caused by ongoing operations on + the table? Old operations could try to access non-existent + pages. The SQL layer will block all DML on the table using MDL and a + DISCARD will not start unless all existing operations on the + table to be discarded are completed. + + 1) Acquire the data dictionary latch in X mode. This will + prevent any internal operations that are not covered by + MDL or InnoDB table locks. + + 2) Purge and rollback: we assign a new table id for the + table. Since purge and rollback look for the table based on + the table id, they see the table as 'dropped' and discard + their operations. + + 3) Insert buffer: we remove all entries for the tablespace in + the insert buffer tree. */ + + ibuf_delete_for_discarded_space(table->space_id); + + table_id_t new_id; + + /* Set the TABLESPACE DISCARD flag in the table definition + on disk. */ + err = row_import_update_discarded_flag(trx, table->id, true); + + if (err != DB_SUCCESS) { + return(err); + } + + /* Update the index root pages in the system tables, on disk */ + err = row_import_update_index_root(trx, table, true); + + if (err != DB_SUCCESS) { + return(err); + } + + /* Drop all the FTS auxiliary tables. */ + if (dict_table_has_fts_index(table) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { + + fts_drop_tables(trx, *table); + } + + /* Assign a new space ID to the table definition so that purge + can ignore the changes. Update the system table on disk. */ + + err = row_mysql_table_id_reassign(table, trx, &new_id); + + if (err != DB_SUCCESS) { + return(err); + } + + /* All persistent operations successful, update the + data dictionary memory cache. */ + + dict_table_change_id_in_cache(table, new_id); + + dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + if (index) index->clear_instant_alter(); + + /* Reset the root page numbers. 
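+	Presumably this marks every index tree of the discarded table as
+	missing until the tablespace is imported again.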
*/ + for (; index; index = UT_LIST_GET_NEXT(indexes, index)) { + index->page = FIL_NULL; + } + + /* If the tablespace did not already exist or we couldn't + write to it, we treat that as a successful DISCARD. It is + unusable anyway. */ + return DB_SUCCESS; +} + +/*********************************************************************//** +Discards the tablespace of a table which stored in an .ibd file. Discarding +means that this function renames the .ibd file and assigns a new table id for +the table. Also the file_unreadable flag is set. +@return error code or DB_SUCCESS */ +dberr_t row_discard_tablespace_for_mysql(dict_table_t *table, trx_t *trx) +{ + ut_ad(!is_system_tablespace(table->space_id)); + ut_ad(!table->is_temporary()); + + const auto fts_exist = table->flags2 & + (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS); + + dberr_t err; + + if (fts_exist) + { + fts_optimize_remove_table(table); + purge_sys.stop_FTS(*table); + err= fts_lock_tables(trx, *table); + if (err != DB_SUCCESS) + { +rollback: + if (fts_exist) + { + purge_sys.resume_FTS(); + fts_optimize_add_table(table); + } + trx->rollback(); + if (trx->dict_operation_lock_mode) + row_mysql_unlock_data_dictionary(trx); + return err; + } + } + + row_mysql_lock_data_dictionary(trx); + trx->op_info = "discarding tablespace"; + trx->dict_operation= true; + + /* We serialize data dictionary operations with dict_sys.latch: + this is to avoid deadlocks during data dictionary operations */ + + err= row_discard_tablespace_foreign_key_checks(trx, table); + if (err != DB_SUCCESS) + goto rollback; + + /* Note: The following cannot be rolled back. Rollback would see the + UPDATE of SYS_INDEXES.TABLE_ID as two operations: DELETE and INSERT. + It would invoke btr_free_if_exists() when rolling back the INSERT, + effectively dropping all indexes of the table. Furthermore, calls like + ibuf_delete_for_discarded_space() are already discarding data + before the transaction is committed. + + It would be better to remove the integrity-breaking + ALTER TABLE...DISCARD TABLESPACE operation altogether. */ + table->file_unreadable= true; + table->space= nullptr; + table->flags2|= DICT_TF2_DISCARDED; + err= row_discard_tablespace(trx, table); + DBUG_EXECUTE_IF("ib_discard_before_commit_crash", + log_buffer_flush_to_disk(); DBUG_SUICIDE();); + /* FTS_ tables may be deleted */ + std::vector<pfs_os_file_t> deleted; + trx->commit(deleted); + const auto space_id= table->space_id; + pfs_os_file_t d= fil_delete_tablespace(space_id); + DBUG_EXECUTE_IF("ib_discard_after_commit_crash", DBUG_SUICIDE();); + row_mysql_unlock_data_dictionary(trx); + + if (d != OS_FILE_CLOSED) + os_file_close(d); + for (pfs_os_file_t d : deleted) + os_file_close(d); + + if (fts_exist) + purge_sys.resume_FTS(); + + ibuf_delete_for_discarded_space(space_id); + buf_flush_remove_pages(space_id); + trx->op_info= ""; + return err; +} + +/****************************************************************//** +Delete a single constraint. 
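+The constraint is removed from both SYS_FOREIGN and SYS_FOREIGN_COLS.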
+@return error code or DB_SUCCESS */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_delete_constraint_low( +/*======================*/ + const char* id, /*!< in: constraint id */ + trx_t* trx) /*!< in: transaction handle */ +{ + pars_info_t* info = pars_info_create(); + + pars_info_add_str_literal(info, "id", id); + + return(que_eval_sql(info, + "PROCEDURE DELETE_CONSTRAINT () IS\n" + "BEGIN\n" + "DELETE FROM SYS_FOREIGN_COLS WHERE ID = :id;\n" + "DELETE FROM SYS_FOREIGN WHERE ID = :id;\n" + "END;\n", trx)); +} + +/****************************************************************//** +Delete a single constraint. +@return error code or DB_SUCCESS */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_delete_constraint( +/*==================*/ + const char* id, /*!< in: constraint id */ + const char* database_name, /*!< in: database name, with the + trailing '/' */ + mem_heap_t* heap, /*!< in: memory heap */ + trx_t* trx) /*!< in: transaction handle */ +{ + dberr_t err; + + /* New format constraints have ids <databasename>/<constraintname>. */ + err = row_delete_constraint_low( + mem_heap_strcat(heap, database_name, id), trx); + + if ((err == DB_SUCCESS) && !strchr(id, '/')) { + /* Old format < 4.0.18 constraints have constraint ids + NUMBER_NUMBER. We only try deleting them if the + constraint name does not contain a '/' character, otherwise + deleting a new format constraint named 'foo/bar' from + database 'baz' would remove constraint 'bar' from database + 'foo', if it existed. */ + + err = row_delete_constraint_low(id, trx); + } + + return(err); +} + +/*********************************************************************//** +Renames a table for MySQL. +@return error code or DB_SUCCESS */ +dberr_t +row_rename_table_for_mysql( +/*=======================*/ + const char* old_name, /*!< in: old table name */ + const char* new_name, /*!< in: new table name */ + trx_t* trx, /*!< in/out: transaction */ + bool use_fk) /*!< in: whether to parse and enforce + FOREIGN KEY constraints */ +{ + dict_table_t* table = NULL; + dberr_t err = DB_ERROR; + mem_heap_t* heap = NULL; + const char** constraints_to_drop = NULL; + ulint n_constraints_to_drop = 0; + ibool old_is_tmp, new_is_tmp; + pars_info_t* info = NULL; + + ut_a(old_name != NULL); + ut_a(new_name != NULL); + ut_ad(trx->state == TRX_STATE_ACTIVE); + ut_ad(trx->dict_operation_lock_mode); + + if (high_level_read_only) { + return(DB_READ_ONLY); + } + + trx->op_info = "renaming table"; + + old_is_tmp = dict_table_t::is_temporary_name(old_name); + new_is_tmp = dict_table_t::is_temporary_name(new_name); + + table = dict_table_open_on_name(old_name, true, + DICT_ERR_IGNORE_FK_NOKEY); + + /* MariaDB partition engine hard codes the file name + separator as "#P#" and "#SP#". The text case is fixed even if + lower_case_table_names is set to 1 or 2. InnoDB always + normalises file names to lower case on Windows, this + can potentially cause problems when copying/moving + tables between platforms. + + 1) If boot against an installation from Windows + platform, then its partition table name could + be all be in lower case in system tables. So we + will need to check lower case name when load table. + + 2) If we boot an installation from other case + sensitive platform in Windows, we might need to + check the existence of table name without lowering + case them in the system table. 
*/ + if (!table && lower_case_table_names == 1 + && strstr(old_name, table_name_t::part_suffix)) { + char par_case_name[MAX_FULL_NAME_LEN + 1]; +#ifndef _WIN32 + /* Check for the table using lower + case name, including the partition + separator "P" */ + memcpy(par_case_name, old_name, + strlen(old_name)); + par_case_name[strlen(old_name)] = 0; + innobase_casedn_str(par_case_name); +#else + /* On Windows platfrom, check + whether there exists table name in + system table whose name is + not being normalized to lower case */ + normalize_table_name_c_low( + par_case_name, old_name, FALSE); +#endif + table = dict_table_open_on_name(par_case_name, true, + DICT_ERR_IGNORE_FK_NOKEY); + } + + if (!table) { + err = DB_TABLE_NOT_FOUND; + goto funct_exit; + } + + ut_ad(!table->is_temporary()); + + if (!table->is_readable() && !table->space + && !(table->flags2 & DICT_TF2_DISCARDED)) { + + err = DB_TABLE_NOT_FOUND; + + ib::error() << "Table " << old_name << " does not have an .ibd" + " file in the database directory. " + << TROUBLESHOOTING_MSG; + + goto funct_exit; + + } else if (use_fk && !old_is_tmp && new_is_tmp) { + /* MySQL is doing an ALTER TABLE command and it renames the + original table to a temporary table name. We want to preserve + the original foreign key constraint definitions despite the + name change. An exception is those constraints for which + the ALTER TABLE contained DROP FOREIGN KEY <foreign key id>.*/ + + heap = mem_heap_create(100); + + err = dict_foreign_parse_drop_constraints( + heap, trx, table, &n_constraints_to_drop, + &constraints_to_drop); + + if (err != DB_SUCCESS) { + goto funct_exit; + } + } + + err = trx_undo_report_rename(trx, table); + + if (err != DB_SUCCESS) { + goto funct_exit; + } + + /* We use the private SQL parser of Innobase to generate the query + graphs needed in updating the dictionary data from system tables. */ + + info = pars_info_create(); + + pars_info_add_str_literal(info, "new_table_name", new_name); + pars_info_add_str_literal(info, "old_table_name", old_name); + + err = que_eval_sql(info, + "PROCEDURE RENAME_TABLE () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES" + " SET NAME = :new_table_name\n" + " WHERE NAME = :old_table_name;\n" + "END;\n", trx); + + if (err != DB_SUCCESS) { + // Assume the caller guarantees destination name doesn't exist. + ut_ad(err != DB_DUPLICATE_KEY); + goto rollback_and_exit; + } + + if (!new_is_tmp) { + /* Rename all constraints. */ + char new_table_name[MAX_TABLE_NAME_LEN + 1]; + char old_table_utf8[MAX_TABLE_NAME_LEN + 1]; + uint errors = 0; + + strncpy(old_table_utf8, old_name, MAX_TABLE_NAME_LEN); + old_table_utf8[MAX_TABLE_NAME_LEN] = '\0'; + innobase_convert_to_system_charset( + strchr(old_table_utf8, '/') + 1, + strchr(old_name, '/') +1, + MAX_TABLE_NAME_LEN, &errors); + + if (errors) { + /* Table name could not be converted from charset + my_charset_filename to UTF-8. This means that the + table name is already in UTF-8 (#mysql#50). 
*/ + strncpy(old_table_utf8, old_name, MAX_TABLE_NAME_LEN); + old_table_utf8[MAX_TABLE_NAME_LEN] = '\0'; + } + + info = pars_info_create(); + + pars_info_add_str_literal(info, "new_table_name", new_name); + pars_info_add_str_literal(info, "old_table_name", old_name); + pars_info_add_str_literal(info, "old_table_name_utf8", + old_table_utf8); + + strncpy(new_table_name, new_name, MAX_TABLE_NAME_LEN); + new_table_name[MAX_TABLE_NAME_LEN] = '\0'; + innobase_convert_to_system_charset( + strchr(new_table_name, '/') + 1, + strchr(new_name, '/') +1, + MAX_TABLE_NAME_LEN, &errors); + + if (errors) { + /* Table name could not be converted from charset + my_charset_filename to UTF-8. This means that the + table name is already in UTF-8 (#mysql#50). */ + strncpy(new_table_name, new_name, MAX_TABLE_NAME_LEN); + new_table_name[MAX_TABLE_NAME_LEN] = '\0'; + } + + pars_info_add_str_literal(info, "new_table_utf8", new_table_name); + + err = que_eval_sql( + info, + "PROCEDURE RENAME_CONSTRAINT_IDS () IS\n" + "gen_constr_prefix CHAR;\n" + "new_db_name CHAR;\n" + "foreign_id CHAR;\n" + "new_foreign_id CHAR;\n" + "old_db_name_len INT;\n" + "old_t_name_len INT;\n" + "new_db_name_len INT;\n" + "id_len INT;\n" + "offset INT;\n" + "found INT;\n" + "BEGIN\n" + "found := 1;\n" + "old_db_name_len := INSTR(:old_table_name, '/')-1;\n" + "new_db_name_len := INSTR(:new_table_name, '/')-1;\n" + "new_db_name := SUBSTR(:new_table_name, 0,\n" + " new_db_name_len);\n" + "old_t_name_len := LENGTH(:old_table_name);\n" + "gen_constr_prefix := CONCAT(:old_table_name_utf8,\n" + " '_ibfk_');\n" + "WHILE found = 1 LOOP\n" + " SELECT ID INTO foreign_id\n" + " FROM SYS_FOREIGN\n" + " WHERE FOR_NAME = :old_table_name\n" + " AND TO_BINARY(FOR_NAME)\n" + " = TO_BINARY(:old_table_name)\n" + " LOCK IN SHARE MODE;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " UPDATE SYS_FOREIGN\n" + " SET FOR_NAME = :new_table_name\n" + " WHERE ID = foreign_id;\n" + " id_len := LENGTH(foreign_id);\n" + " IF (INSTR(foreign_id, '/') > 0) THEN\n" + " IF (INSTR(foreign_id,\n" + " gen_constr_prefix) > 0)\n" + " THEN\n" + " offset := INSTR(foreign_id, '_ibfk_') - 1;\n" + " new_foreign_id :=\n" + " CONCAT(:new_table_utf8,\n" + " SUBSTR(foreign_id, offset,\n" + " id_len - offset));\n" + " ELSE\n" + " new_foreign_id :=\n" + " CONCAT(new_db_name,\n" + " SUBSTR(foreign_id,\n" + " old_db_name_len,\n" + " id_len - old_db_name_len));\n" + " END IF;\n" + " UPDATE SYS_FOREIGN\n" + " SET ID = new_foreign_id\n" + " WHERE ID = foreign_id;\n" + " UPDATE SYS_FOREIGN_COLS\n" + " SET ID = new_foreign_id\n" + " WHERE ID = foreign_id;\n" + " END IF;\n" + " END IF;\n" + "END LOOP;\n" + "UPDATE SYS_FOREIGN SET REF_NAME = :new_table_name\n" + "WHERE REF_NAME = :old_table_name\n" + " AND TO_BINARY(REF_NAME)\n" + " = TO_BINARY(:old_table_name);\n" + "END;\n", trx); + + } else if (n_constraints_to_drop > 0) { + /* Drop some constraints of tmp tables. 
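+		These are the constraints that the ALTER TABLE statement
+		explicitly dropped with DROP FOREIGN KEY.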
*/ + + ulint db_name_len = dict_get_db_name_len(old_name) + 1; + char* db_name = mem_heap_strdupl(heap, old_name, + db_name_len); + ulint i; + + for (i = 0; i < n_constraints_to_drop; i++) { + err = row_delete_constraint(constraints_to_drop[i], + db_name, heap, trx); + + if (err != DB_SUCCESS) { + break; + } + } + } + + if (err == DB_SUCCESS + && (dict_table_has_fts_index(table) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) + && !dict_tables_have_same_db(old_name, new_name)) { + err = fts_rename_aux_tables(table, new_name, trx); + } + + switch (err) { + case DB_DUPLICATE_KEY: + ib::error() << "Table rename might cause two" + " FOREIGN KEY constraints to have the same" + " internal name in case-insensitive comparison."; + ib::info() << TROUBLESHOOTING_MSG; + /* fall through */ + rollback_and_exit: + default: + trx->error_state = DB_SUCCESS; + trx->rollback(); + trx->error_state = DB_SUCCESS; + break; + case DB_SUCCESS: + DEBUG_SYNC_C("innodb_rename_in_cache"); + /* The following call will also rename the .ibd file */ + err = dict_table_rename_in_cache( + table, span<const char>{new_name,strlen(new_name)}, + false); + if (err != DB_SUCCESS) { + goto rollback_and_exit; + } + + /* In case of copy alter, template db_name and + table_name should be renamed only for newly + created table. */ + if (table->vc_templ != NULL && !new_is_tmp) { + innobase_rename_vc_templ(table); + } + + /* We only want to switch off some of the type checking in + an ALTER TABLE, not in a RENAME. */ + dict_names_t fk_tables; + + err = dict_load_foreigns( + new_name, nullptr, trx->id, + !old_is_tmp || trx->check_foreigns, + use_fk + ? DICT_ERR_IGNORE_NONE + : DICT_ERR_IGNORE_FK_NOKEY, + fk_tables); + + if (err != DB_SUCCESS) { + if (old_is_tmp) { + /* In case of copy alter, ignore the + loading of foreign key constraint + when foreign_key_check is disabled */ + ib::error_or_warn(trx->check_foreigns) + << "In ALTER TABLE " + << ut_get_name(trx, new_name) + << " has or is referenced in foreign" + " key constraints which are not" + " compatible with the new table" + " definition."; + if (!trx->check_foreigns) { + err = DB_SUCCESS; + break; + } + } else { + ib::error() << "In RENAME TABLE table " + << ut_get_name(trx, new_name) + << " is referenced in foreign key" + " constraints which are not compatible" + " with the new table definition."; + } + + goto rollback_and_exit; + } + + /* Check whether virtual column or stored column affects + the foreign key constraint of the table. */ + if (dict_foreigns_has_s_base_col(table->foreign_set, table)) { + err = DB_NO_FK_ON_S_BASE_COL; + goto rollback_and_exit; + } + + /* Fill the virtual column set in foreign when + the table undergoes copy alter operation. */ + dict_mem_table_free_foreign_vcol_set(table); + dict_mem_table_fill_foreign_vcol_set(table); + + while (!fk_tables.empty()) { + const char *f = fk_tables.front(); + dict_sys.load_table({f, strlen(f)}); + fk_tables.pop_front(); + } + + table->data_dir_path= NULL; + } + +funct_exit: + if (table) { + table->release(); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + trx->op_info = ""; + + return(err); +} diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc new file mode 100644 index 00000000..4756cc37 --- /dev/null +++ b/storage/innobase/row/row0purge.cc @@ -0,0 +1,1304 @@ +/***************************************************************************** + +Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. 
+Copyright (c) 2017, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0purge.cc +Purge obsolete records + +Created 3/14/1997 Heikki Tuuri +*******************************************************/ + +#include "row0purge.h" +#include "btr0cur.h" +#include "fsp0fsp.h" +#include "mach0data.h" +#include "dict0crea.h" +#include "dict0stats.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0upd.h" +#include "row0vers.h" +#include "row0mysql.h" +#include "log0log.h" +#include "srv0mon.h" +#include "srv0start.h" +#include "handler.h" +#include "ha_innodb.h" +#include "fil0fil.h" +#include "debug_sync.h" +#include <mysql/service_thd_mdl.h> + +/************************************************************************* +IMPORTANT NOTE: Any operation that generates redo MUST check that there +is enough space in the redo log before for that operation. This is +done by calling log_free_check(). The reason for checking the +availability of the redo log space before the start of the operation is +that we MUST not hold any synchonization objects when performing the +check. +If you make a change in this module make sure that no codepath is +introduced where a call to log_free_check() is bypassed. */ + +/***********************************************************//** +Repositions the pcur in the purge node on the clustered index record, +if found. If the record is not found, close pcur. +@return TRUE if the record was found */ +static +ibool +row_purge_reposition_pcur( +/*======================*/ + btr_latch_mode mode, /*!< in: latching mode */ + purge_node_t* node, /*!< in: row purge node */ + mtr_t* mtr) /*!< in: mtr */ +{ + if (node->found_clust) { + ut_ad(node->validate_pcur()); + + node->found_clust = + node->pcur.restore_position(mode, mtr) == + btr_pcur_t::SAME_ALL; + + } else { + node->found_clust = row_search_on_row_ref( + &node->pcur, mode, node->table, node->ref, mtr); + + if (node->found_clust) { + btr_pcur_store_position(&node->pcur, mtr); + } + } + + /* Close the current cursor if we fail to position it correctly. */ + if (!node->found_clust) { + btr_pcur_close(&node->pcur); + } + + return(node->found_clust); +} + +/***********************************************************//** +Removes a delete marked clustered index record if possible. 
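+If the record belongs to SYS_INDEXES, the index tree (and possibly the
+whole tablespace) that it refers to is freed as well.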
+@retval true if the row was not found, or it was successfully removed +@retval false if the row was modified after the delete marking */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +row_purge_remove_clust_if_poss_low( +/*===============================*/ + purge_node_t* node, /*!< in/out: row purge node */ + btr_latch_mode mode) /*!< in: BTR_MODIFY_LEAF or BTR_PURGE_TREE */ +{ + dict_index_t* index = dict_table_get_first_index(node->table); + table_id_t table_id = 0; + index_id_t index_id = 0; + dict_table_t *table = nullptr; + pfs_os_file_t f = OS_FILE_CLOSED; + + if (table_id) { +retry: + dict_sys.lock(SRW_LOCK_CALL); + table = dict_sys.find_table(table_id); + if (!table) { + dict_sys.unlock(); + } else if (table->n_rec_locks) { + for (dict_index_t* ind = UT_LIST_GET_FIRST( + table->indexes); ind; + ind = UT_LIST_GET_NEXT(indexes, ind)) { + if (ind->id == index_id) { + lock_discard_for_index(*ind); + } + } + } + } + mtr_t mtr; + mtr.start(); + index->set_modified(mtr); + log_free_check(); + bool success = true; + + if (!row_purge_reposition_pcur(mode, node, &mtr)) { + /* The record was already removed. */ +removed: + mtr.commit(); +close_and_exit: + if (table) { + dict_sys.unlock(); + } + return success; + } + + if (node->table->id == DICT_INDEXES_ID) { + /* If this is a record of the SYS_INDEXES table, then + we have to free the file segments of the index tree + associated with the index */ + if (!table_id) { + const rec_t* rec = btr_pcur_get_rec(&node->pcur); + + table_id = mach_read_from_8(rec); + index_id = mach_read_from_8(rec + 8); + if (table_id) { + mtr.commit(); + goto retry; + } + ut_ad("corrupted SYS_INDEXES record" == 0); + } + + const uint32_t space_id = dict_drop_index_tree( + &node->pcur, nullptr, &mtr); + if (space_id) { + if (table) { + if (table->get_ref_count() == 0) { + dict_sys.remove(table); + } else if (table->space_id == space_id) { + table->space = nullptr; + table->file_unreadable = true; + } + dict_sys.unlock(); + table = nullptr; + } + f = fil_delete_tablespace(space_id); + } + + mtr.commit(); + + if (table) { + dict_sys.unlock(); + table = nullptr; + } + + if (space_id) { + ibuf_delete_for_discarded_space(space_id); + } + + mtr.start(); + index->set_modified(mtr); + + if (!row_purge_reposition_pcur(mode, node, &mtr)) { + goto removed; + } + } + + rec_t* rec = btr_pcur_get_rec(&node->pcur); + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + mem_heap_t* heap = NULL; + rec_offs* offsets = rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (node->roll_ptr != row_get_rec_roll_ptr(rec, index, offsets)) { + /* Someone else has modified the record later: do not remove */ + goto func_exit; + } + + ut_ad(rec_get_deleted_flag(rec, rec_offs_comp(offsets))); + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(row_get_rec_trx_id(rec, index, offsets)); + + if (mode == BTR_MODIFY_LEAF) { + success = DB_FAIL != btr_cur_optimistic_delete( + btr_pcur_get_btr_cur(&node->pcur), 0, &mtr); + } else { + dberr_t err; + ut_ad(mode == BTR_PURGE_TREE); + btr_cur_pessimistic_delete( + &err, FALSE, btr_pcur_get_btr_cur(&node->pcur), 0, + false, &mtr); + success = err == DB_SUCCESS; + } + +func_exit: + if (heap) { + mem_heap_free(heap); + } + + /* Persistent cursor is closed if reposition fails. 
*/ + if (node->found_clust) { + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); + } else { + mtr_commit(&mtr); + } + + goto close_and_exit; +} + +/***********************************************************//** +Removes a clustered index record if it has not been modified after the delete +marking. +@retval true if the row was not found, or it was successfully removed +@retval false the purge needs to be suspended because of running out +of file space. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +row_purge_remove_clust_if_poss( +/*===========================*/ + purge_node_t* node) /*!< in/out: row purge node */ +{ + if (row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF)) { + return(true); + } + + for (ulint n_tries = 0; + n_tries < BTR_CUR_RETRY_DELETE_N_TIMES; + n_tries++) { + if (row_purge_remove_clust_if_poss_low(node, BTR_PURGE_TREE)) { + return(true); + } + + std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME); + } + + return(false); +} + +/** Determines if it is possible to remove a secondary index entry. +Removal is possible if the secondary index entry does not refer to any +not delete marked version of a clustered index record where DB_TRX_ID +is newer than the purge view. + +NOTE: This function should only be called by the purge thread, only +while holding a latch on the leaf page of the secondary index entry +(or keeping the buffer pool watch on the page). It is possible that +this function first returns true and then false, if a user transaction +inserts a record that the secondary index entry would refer to. +However, in that case, the user transaction would also re-insert the +secondary index entry after purge has removed it and released the leaf +page latch. +@param[in,out] node row purge node +@param[in] index secondary index +@param[in] entry secondary index entry +@param[in,out] sec_pcur secondary index cursor or NULL + if it is called for purge buffering + operation. +@param[in,out] sec_mtr mini-transaction which holds + secondary index entry or NULL if it is + called for purge buffering operation. +@param[in] is_tree true=pessimistic purge, + false=optimistic (leaf-page only) +@return true if the secondary index record can be purged */ +bool +row_purge_poss_sec( + purge_node_t* node, + dict_index_t* index, + const dtuple_t* entry, + btr_pcur_t* sec_pcur, + mtr_t* sec_mtr, + bool is_tree) +{ + bool can_delete; + mtr_t mtr; + + ut_ad(!dict_index_is_clust(index)); + + mtr_start(&mtr); + + can_delete = !row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr) + || !row_vers_old_has_index_entry(true, + btr_pcur_get_rec(&node->pcur), + &mtr, index, entry, + node->roll_ptr, node->trx_id); + + /* Persistent cursor is closed if reposition fails. */ + if (node->found_clust) { + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); + } else { + mtr.commit(); + } + + ut_ad(mtr.has_committed()); + + return can_delete; +} + +/*************************************************************** +Removes a secondary index entry if possible, by modifying the +index tree. Does not try to buffer the delete. 
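+This is the pessimistic variant; it is normally attempted only after
+row_purge_remove_sec_if_poss_leaf() has failed.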
+@return TRUE if success or if not found */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +ibool +row_purge_remove_sec_if_poss_tree( +/*==============================*/ + purge_node_t* node, /*!< in: row purge node */ + dict_index_t* index, /*!< in: index */ + const dtuple_t* entry) /*!< in: index entry */ +{ + btr_pcur_t pcur; + ibool success = TRUE; + dberr_t err; + mtr_t mtr; + + log_free_check(); + mtr.start(); + index->set_modified(mtr); + pcur.btr_cur.page_cur.index = index; + + if (index->is_spatial()) { + if (!rtr_search(entry, BTR_PURGE_TREE, &pcur, &mtr)) { + goto found; + } + goto func_exit; + } + + switch (row_search_index_entry(entry, BTR_PURGE_TREE, &pcur, &mtr)) { + case ROW_NOT_FOUND: + /* Not found. This is a legitimate condition. In a + rollback, InnoDB will remove secondary recs that would + be purged anyway. Then the actual purge will not find + the secondary index record. Also, the purge itself is + eager: if it comes to consider a secondary index + record, and notices it does not need to exist in the + index, it will remove it. Then if/when the purge + comes to consider the secondary index record a second + time, it will not exist any more in the index. */ + + /* fputs("PURGE:........sec entry not found\n", stderr); */ + /* dtuple_print(stderr, entry); */ + goto func_exit; + case ROW_FOUND: + break; + case ROW_BUFFERED: + case ROW_NOT_DELETED_REF: + /* These are invalid outcomes, because the mode passed + to row_search_index_entry() did not include any of the + flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */ + ut_error; + } + + /* We should remove the index record if no later version of the row, + which cannot be purged yet, requires its existence. If some requires, + we should do nothing. */ + +found: + if (row_purge_poss_sec(node, index, entry, &pcur, &mtr, true)) { + + /* Remove the index record, which should have been + marked for deletion. */ + if (!rec_get_deleted_flag(btr_cur_get_rec( + btr_pcur_get_btr_cur(&pcur)), + dict_table_is_comp(index->table))) { + ib::error() + << "tried to purge non-delete-marked record" + " in index " << index->name + << " of table " << index->table->name + << ": tuple: " << *entry + << ", record: " << rec_index_print( + btr_cur_get_rec( + btr_pcur_get_btr_cur(&pcur)), + index); + + ut_ad(0); + + goto func_exit; + } + + btr_cur_pessimistic_delete(&err, FALSE, + btr_pcur_get_btr_cur(&pcur), + 0, false, &mtr); + switch (UNIV_EXPECT(err, DB_SUCCESS)) { + case DB_SUCCESS: + break; + case DB_OUT_OF_FILE_SPACE: + success = FALSE; + break; + default: + ut_error; + } + } + +func_exit: + btr_pcur_close(&pcur); // FIXME: need this? + mtr.commit(); + + return(success); +} + +/*************************************************************** +Removes a secondary index entry without modifying the index tree, +if possible. +@retval true if success or if not found +@retval false if row_purge_remove_sec_if_poss_tree() should be invoked */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +row_purge_remove_sec_if_poss_leaf( +/*==============================*/ + purge_node_t* node, /*!< in: row purge node */ + dict_index_t* index, /*!< in: index */ + const dtuple_t* entry) /*!< in: index entry */ +{ + mtr_t mtr; + btr_pcur_t pcur; + bool success = true; + + log_free_check(); + ut_ad(index->table == node->table); + ut_ad(!index->table->is_temporary()); + mtr.start(); + index->set_modified(mtr); + + pcur.btr_cur.page_cur.index = index; + + /* Set the purge node for the call to row_purge_poss_sec(). 
*/ + pcur.btr_cur.purge_node = node; + if (index->is_spatial()) { + pcur.btr_cur.thr = NULL; + if (!rtr_search(entry, BTR_MODIFY_LEAF, &pcur, &mtr)) { + goto found; + } + goto func_exit; + } + + /* Set the query thread, so that ibuf_insert_low() will be + able to invoke thd_get_trx(). */ + pcur.btr_cur.thr = static_cast<que_thr_t*>(que_node_get_parent(node)); + + switch (row_search_index_entry(entry, index->has_virtual() + ? BTR_MODIFY_LEAF : BTR_PURGE_LEAF, + &pcur, &mtr)) { + case ROW_FOUND: +found: + /* Before attempting to purge a record, check + if it is safe to do so. */ + if (row_purge_poss_sec(node, index, entry, &pcur, &mtr, false)) { + btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); + + /* Only delete-marked records should be purged. */ + if (!rec_get_deleted_flag( + btr_cur_get_rec(btr_cur), + dict_table_is_comp(index->table))) { + + ib::error() + << "tried to purge non-delete-marked" + " record" " in index " << index->name + << " of table " << index->table->name + << ": tuple: " << *entry + << ", record: " + << rec_index_print( + btr_cur_get_rec(btr_cur), + index); + mtr.commit(); + dict_set_corrupted(index, "purge"); + goto cleanup; + } + + if (index->is_spatial()) { + const buf_block_t* block = btr_cur_get_block( + btr_cur); + + if (block->page.id().page_no() + != index->page + && page_get_n_recs(block->page.frame) < 2 + && !lock_test_prdt_page_lock( + btr_cur->rtr_info + && btr_cur->rtr_info->thr + ? thr_get_trx( + btr_cur->rtr_info->thr) + : nullptr, + block->page.id())) { + /* this is the last record on page, + and it has a "page" lock on it, + which mean search is still depending + on it, so do not delete */ + DBUG_LOG("purge", + "skip purging last" + " record on page " + << block->page.id()); + goto func_exit; + } + } + + success = btr_cur_optimistic_delete(btr_cur, 0, &mtr) + != DB_FAIL; + } + + /* (The index entry is still needed, + or the deletion succeeded) */ + /* fall through */ + case ROW_NOT_DELETED_REF: + /* The index entry is still needed. */ + case ROW_BUFFERED: + /* The deletion was buffered. */ + case ROW_NOT_FOUND: + /* The index entry does not exist, nothing to do. */ +func_exit: + mtr.commit(); +cleanup: + btr_pcur_close(&pcur); // FIXME: do we need these? when is btr_cur->rtr_info set? + return(success); + } + + ut_error; + return(false); +} + +/***********************************************************//** +Removes a secondary index entry if possible. */ +UNIV_INLINE MY_ATTRIBUTE((nonnull(1,2))) +void +row_purge_remove_sec_if_poss( +/*=========================*/ + purge_node_t* node, /*!< in: row purge node */ + dict_index_t* index, /*!< in: index */ + const dtuple_t* entry) /*!< in: index entry */ +{ + ibool success; + ulint n_tries = 0; + + /* fputs("Purge: Removing secondary record\n", stderr); */ + + if (!entry) { + /* The node->row must have lacked some fields of this + index. This is possible when the undo log record was + written before this index was created. 
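+		Nothing needs to be purged from such an index.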
*/ + return; + } + + if (row_purge_remove_sec_if_poss_leaf(node, index, entry)) { + + return; + } +retry: + success = row_purge_remove_sec_if_poss_tree(node, index, entry); + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + n_tries++; + + std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + ut_a(success); +} + +/***********************************************************//** +Purges a delete marking of a record. +@retval true if the row was not found, or it was successfully removed +@retval false the purge needs to be suspended because of +running out of file space */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +row_purge_del_mark( +/*===============*/ + purge_node_t* node) /*!< in/out: row purge node */ +{ + if (node->index) + { + mem_heap_t *heap= mem_heap_create(1024); + + do + { + if (node->index->type & (DICT_FTS | DICT_CORRUPT)) + continue; + if (!node->index->is_committed()) + continue; + dtuple_t* entry= row_build_index_entry_low(node->row, nullptr, + node->index, heap, + ROW_BUILD_FOR_PURGE); + row_purge_remove_sec_if_poss(node, node->index, entry); + mem_heap_empty(heap); + } + while ((node->index= dict_table_get_next_index(node->index))); + + mem_heap_free(heap); + } + + bool result= row_purge_remove_clust_if_poss(node); + +#ifdef ENABLED_DEBUG_SYNC + DBUG_EXECUTE_IF("enable_row_purge_del_mark_exit_sync_point", + debug_sync_set_action + (current_thd, + STRING_WITH_LEN("now SIGNAL row_purge_del_mark_finished")); + ); +#endif + + return result; +} + +/** Reset DB_TRX_ID, DB_ROLL_PTR of a clustered index record +whose old history can no longer be observed. +@param[in,out] node purge node +@param[in,out] mtr mini-transaction (will be started and committed) */ +static void row_purge_reset_trx_id(purge_node_t* node, mtr_t* mtr) +{ + /* Reset DB_TRX_ID, DB_ROLL_PTR for old records. */ + mtr->start(); + + if (row_purge_reposition_pcur(BTR_MODIFY_LEAF, node, mtr)) { + dict_index_t* index = dict_table_get_first_index( + node->table); + ulint trx_id_pos = index->n_uniq ? index->n_uniq : 1; + rec_t* rec = btr_pcur_get_rec(&node->pcur); + mem_heap_t* heap = NULL; + /* Reserve enough offsets for the PRIMARY KEY and 2 columns + so that we can access DB_TRX_ID, DB_ROLL_PTR. 
*/ + rec_offs offsets_[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2]; + rec_offs_init(offsets_); + rec_offs* offsets = rec_get_offsets( + rec, index, offsets_, index->n_core_fields, + trx_id_pos + 2, &heap); + ut_ad(heap == NULL); + + ut_ad(dict_index_get_nth_field(index, trx_id_pos) + ->col->mtype == DATA_SYS); + ut_ad(dict_index_get_nth_field(index, trx_id_pos) + ->col->prtype == (DATA_TRX_ID | DATA_NOT_NULL)); + ut_ad(dict_index_get_nth_field(index, trx_id_pos + 1) + ->col->mtype == DATA_SYS); + ut_ad(dict_index_get_nth_field(index, trx_id_pos + 1) + ->col->prtype == (DATA_ROLL_PTR | DATA_NOT_NULL)); + + /* Only update the record if DB_ROLL_PTR matches (the + record has not been modified after this transaction + became purgeable) */ + if (node->roll_ptr + == row_get_rec_roll_ptr(rec, index, offsets)) { + ut_ad(!rec_get_deleted_flag( + rec, rec_offs_comp(offsets)) + || rec_is_alter_metadata(rec, *index)); + DBUG_LOG("purge", "reset DB_TRX_ID=" + << ib::hex(row_get_rec_trx_id( + rec, index, offsets))); + + index->set_modified(*mtr); + buf_block_t* block = btr_pcur_get_block(&node->pcur); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + page_zip_write_trx_id_and_roll_ptr( + block, rec, offsets, trx_id_pos, + 0, 1ULL << ROLL_PTR_INSERT_FLAG_POS, + mtr); + } else { + ulint len; + byte* ptr = rec_get_nth_field( + rec, offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + size_t offs = page_offset(ptr); + mtr->memset(block, offs, DATA_TRX_ID_LEN, 0); + offs += DATA_TRX_ID_LEN; + mtr->write<1,mtr_t::MAYBE_NOP>( + *block, block->page.frame + offs, + 0x80U); + mtr->memset(block, offs + 1, + DATA_ROLL_PTR_LEN - 1, 0); + } + } + } + + mtr->commit(); +} + +/***********************************************************//** +Purges an update of an existing record. Also purges an update of a delete +marked record if that record contained an externally stored field. */ +static +void +row_purge_upd_exist_or_extern_func( +/*===============================*/ +#ifdef UNIV_DEBUG + const que_thr_t*thr, /*!< in: query thread */ +#endif /* UNIV_DEBUG */ + purge_node_t* node, /*!< in: row purge node */ + const trx_undo_rec_t* undo_rec) /*!< in: record to purge */ +{ + mem_heap_t* heap; + + ut_ad(!node->table->skip_alter_undo); + + if (node->rec_type == TRX_UNDO_UPD_DEL_REC + || (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) + || !node->index) { + + goto skip_secondaries; + } + + heap = mem_heap_create(1024); + + do { + if (node->index->type & (DICT_FTS | DICT_CORRUPT)) { + continue; + } + + if (!node->index->is_committed()) { + continue; + } + + if (row_upd_changes_ord_field_binary(node->index, node->update, + thr, NULL, NULL)) { + /* Build the older version of the index entry */ + dtuple_t* entry = row_build_index_entry_low( + node->row, NULL, node->index, + heap, ROW_BUILD_FOR_PURGE); + row_purge_remove_sec_if_poss(node, node->index, entry); + + ut_ad(node->table); + + mem_heap_empty(heap); + } + } while ((node->index = dict_table_get_next_index(node->index))); + + mem_heap_free(heap); + +skip_secondaries: + mtr_t mtr; + dict_index_t* index = dict_table_get_first_index(node->table); + /* Free possible externally stored fields */ + for (ulint i = 0; i < upd_get_n_fields(node->update); i++) { + + const upd_field_t* ufield + = upd_get_nth_field(node->update, i); + + if (dfield_is_ext(&ufield->new_val)) { + bool is_insert; + ulint rseg_id; + uint32_t page_no; + uint16_t offset; + + /* We use the fact that new_val points to + undo_rec and get thus the offset of + dfield data inside the undo record. 
Then we + can calculate from node->roll_ptr the file + address of the new_val data */ + + const uint16_t internal_offset = uint16_t( + static_cast<const byte*> + (dfield_get_data(&ufield->new_val)) + - undo_rec); + + ut_a(internal_offset < srv_page_size); + + trx_undo_decode_roll_ptr(node->roll_ptr, + &is_insert, &rseg_id, + &page_no, &offset); + + const trx_rseg_t &rseg = trx_sys.rseg_array[rseg_id]; + ut_ad(rseg.is_persistent()); + + mtr.start(); + + /* We have to acquire an SX-latch to the clustered + index tree (exclude other tree changes) */ + + mtr_sx_lock_index(index, &mtr); + + index->set_modified(mtr); + + /* NOTE: we must also acquire a U latch to the + root page of the tree. We will need it when we + free pages from the tree. If the tree is of height 1, + the tree X-latch does NOT protect the root page, + because it is also a leaf page. Since we will have a + latch on an undo log page, we would break the + latching order if we would only later latch the + root page of such a tree! */ + + dberr_t err; + if (!btr_root_block_get(index, RW_SX_LATCH, &mtr, + &err)) { + } else if (buf_block_t* block = + buf_page_get(page_id_t(rseg.space->id, + page_no), + 0, RW_X_LATCH, &mtr)) { + block->page.set_accessed(); + buf_page_make_young_if_needed(&block->page); + + byte* data_field = block->page.frame + + offset + internal_offset; + + ut_a(dfield_get_len(&ufield->new_val) + >= BTR_EXTERN_FIELD_REF_SIZE); + btr_free_externally_stored_field( + index, + data_field + + dfield_get_len(&ufield->new_val) + - BTR_EXTERN_FIELD_REF_SIZE, + NULL, NULL, block, 0, false, &mtr); + } + + mtr.commit(); + } + } + + row_purge_reset_trx_id(node, &mtr); +} + +#ifdef UNIV_DEBUG +# define row_purge_upd_exist_or_extern(thr,node,undo_rec) \ + row_purge_upd_exist_or_extern_func(thr,node,undo_rec) +#else /* UNIV_DEBUG */ +# define row_purge_upd_exist_or_extern(thr,node,undo_rec) \ + row_purge_upd_exist_or_extern_func(node,undo_rec) +#endif /* UNIV_DEBUG */ + +/** Build a partial row from an update undo log record for purge. +Any columns which occur as ordering in any index of the table are present. +Any missing columns are indicated by col->mtype == DATA_MISSING. + +@param ptr remaining part of the undo log record +@param index clustered index +@param node purge node +@return pointer to remaining part of undo record */ +static byte *row_purge_get_partial(const byte *ptr, const dict_index_t &index, + purge_node_t *node) +{ + bool first_v_col= true; + bool is_undo_log= true; + + ut_ad(index.is_primary()); + ut_ad(index.n_uniq == node->ref->n_fields); + + node->row= dtuple_create_with_vcol(node->heap, index.table->n_cols, + index.table->n_v_cols); + + /* Mark all columns in the row uninitialized, so that + we can distinguish missing fields from fields that are SQL NULL. 
*/ + for (ulint i= 0; i < index.table->n_cols; i++) + node->row->fields[i].type.mtype= DATA_MISSING; + + dtuple_init_v_fld(node->row); + + for (const upd_field_t *uf= node->update->fields, *const ue= + node->update->fields + node->update->n_fields; uf != ue; uf++) + { + if (!uf->old_v_val) + { + const dict_col_t &c= *dict_index_get_nth_col(&index, uf->field_no); + if (!c.is_dropped()) + node->row->fields[c.ind]= uf->new_val; + } + } + + const byte *end_ptr= ptr + mach_read_from_2(ptr); + ptr+= 2; + + while (ptr != end_ptr) + { + dfield_t *dfield; + const byte *field; + const dict_col_t *col; + uint32_t len, orig_len, field_no= mach_read_next_compressed(&ptr); + + if (field_no >= REC_MAX_N_FIELDS) + { + ptr= trx_undo_read_v_idx(index.table, ptr, first_v_col, &is_undo_log, + &field_no); + first_v_col= false; + + ptr= trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + + if (field_no == FIL_NULL) + continue; /* there no longer is an index on the virtual column */ + + dict_v_col_t *vcol= dict_table_get_nth_v_col(index.table, field_no); + col =&vcol->m_col; + dfield= dtuple_get_nth_v_field(node->row, vcol->v_pos); + dict_col_copy_type(&vcol->m_col, &dfield->type); + } + else + { + ptr= trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + col= dict_index_get_nth_col(&index, field_no); + if (col->is_dropped()) + continue; + dfield= dtuple_get_nth_field(node->row, col->ind); + ut_ad(dfield->type.mtype == DATA_MISSING || + dict_col_type_assert_equal(col, &dfield->type)); + ut_ad(dfield->type.mtype == DATA_MISSING || + dfield->len == len || + (len != UNIV_SQL_NULL && len >= UNIV_EXTERN_STORAGE_FIELD)); + dict_col_copy_type(dict_table_get_nth_col(index.table, col->ind), + &dfield->type); + } + + dfield_set_data(dfield, field, len); + + if (len == UNIV_SQL_NULL || len < UNIV_EXTERN_STORAGE_FIELD) + continue; + + spatial_status_t spatial_status= static_cast<spatial_status_t> + ((len & SPATIAL_STATUS_MASK) >> SPATIAL_STATUS_SHIFT); + len&= ~SPATIAL_STATUS_MASK; + + /* Keep compatible with 5.7.9 format. */ + if (spatial_status == SPATIAL_UNKNOWN) + spatial_status= dict_col_get_spatial_status(col); + + switch (UNIV_EXPECT(spatial_status, SPATIAL_NONE)) { + case SPATIAL_ONLY: + ut_ad(len - UNIV_EXTERN_STORAGE_FIELD == DATA_MBR_LEN); + dfield_set_len(dfield, len - UNIV_EXTERN_STORAGE_FIELD); + break; + + case SPATIAL_MIXED: + dfield_set_len(dfield, len - UNIV_EXTERN_STORAGE_FIELD - DATA_MBR_LEN); + break; + + default: + dfield_set_len(dfield, len - UNIV_EXTERN_STORAGE_FIELD); + break; + } + + dfield_set_ext(dfield); + dfield_set_spatial_status(dfield, spatial_status); + + if (!col->ord_part || spatial_status == SPATIAL_ONLY || + node->rec_type == TRX_UNDO_UPD_DEL_REC) + continue; + /* If the prefix of this BLOB column is indexed, ensure that enough + prefix is stored in the undo log record. */ + ut_a(dfield_get_len(dfield) >= BTR_EXTERN_FIELD_REF_SIZE); + ut_a(dict_table_has_atomic_blobs(index.table) || + dfield_get_len(dfield) >= + REC_ANTELOPE_MAX_INDEX_COL_LEN + BTR_EXTERN_FIELD_REF_SIZE); + } + + for (ulint i= 0; i < index.n_uniq; i++) + { + dfield_t &field= node->row->fields[index.fields[i].col->ind]; + if (field.type.mtype == DATA_MISSING) + field= node->ref->fields[i]; + } + + return const_cast<byte*>(ptr); +} + +MY_ATTRIBUTE((nonnull,warn_unused_result)) +/** Parses the row reference and other info in a modify undo log record. 
+@param[in] node row undo node +@param[in] undo_rec record to purge +@param[in] thr query thread +@param[out] updated_extern true if an externally stored field was + updated +@return true if purge operation required */ +static +bool +row_purge_parse_undo_rec( + purge_node_t* node, + const trx_undo_rec_t* undo_rec, + que_thr_t* thr, + bool* updated_extern) +{ + dict_index_t* clust_index; + undo_no_t undo_no; + table_id_t table_id; + roll_ptr_t roll_ptr; + byte info_bits; + byte type; + + const byte* ptr = trx_undo_rec_get_pars( + undo_rec, &type, &node->cmpl_info, + updated_extern, &undo_no, &table_id); + + node->rec_type = type; + + switch (type) { + case TRX_UNDO_RENAME_TABLE: + return false; + case TRX_UNDO_EMPTY: + case TRX_UNDO_INSERT_METADATA: + case TRX_UNDO_INSERT_REC: + /* These records do not store any transaction identifier. */ + node->trx_id = TRX_ID_MAX; + break; + default: +#ifdef UNIV_DEBUG + ut_ad("unknown undo log record type" == 0); + return false; + case TRX_UNDO_UPD_DEL_REC: + case TRX_UNDO_UPD_EXIST_REC: + case TRX_UNDO_DEL_MARK_REC: +#endif /* UNIV_DEBUG */ + ptr = trx_undo_update_rec_get_sys_cols(ptr, &node->trx_id, + &roll_ptr, &info_bits); + break; + } + + auto &tables_entry= node->tables[table_id]; + node->table = tables_entry.first; + if (!node->table) { + return false; + } + +#ifndef DBUG_OFF + if (MDL_ticket* mdl = tables_entry.second) { + static_cast<MDL_context*>(thd_mdl_context(current_thd)) + ->lock_warrant = mdl->get_ctx(); + } +#endif + ut_ad(!node->table->is_temporary()); + + clust_index = dict_table_get_first_index(node->table); + + if (clust_index->is_corrupted()) { + /* The table was corrupt in the data dictionary. + dict_set_corrupted() works on an index, and + we do not have an index to call it with. */ + DBUG_ASSERT(table_id == node->table->id); + return false; + } + + switch (type) { + case TRX_UNDO_INSERT_METADATA: + node->ref = &trx_undo_metadata; + return true; + case TRX_UNDO_EMPTY: + node->ref = nullptr; + return true; + } + + ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), + node->heap); + + if (type == TRX_UNDO_INSERT_REC) { + return(true); + } + + ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, + node->trx_id, + roll_ptr, info_bits, + node->heap, &(node->update)); + + /* Read to the partial row the fields that occur in indexes */ + + if (!(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + ut_ad(!(node->update->info_bits & REC_INFO_MIN_REC_FLAG)); + ptr = row_purge_get_partial(ptr, *clust_index, node); + } else if (node->update->info_bits & REC_INFO_MIN_REC_FLAG) { + node->ref = &trx_undo_metadata; + } + + return(true); +} + +/** Purges the parsed record. 
+@param[in] node row purge node +@param[in] undo_rec record to purge +@param[in] thr query thread +@param[in] updated_extern whether external columns were updated +@return true if purged, false if skipped */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +row_purge_record_func( + purge_node_t* node, + const trx_undo_rec_t* undo_rec, +#if defined UNIV_DEBUG || defined WITH_WSREP + const que_thr_t*thr, +#endif /* UNIV_DEBUG || WITH_WSREP */ + bool updated_extern) +{ + ut_ad(!node->found_clust); + ut_ad(!node->table->skip_alter_undo); + ut_ad(!trx_undo_roll_ptr_is_insert(node->roll_ptr)); + + node->index = dict_table_get_next_index( + dict_table_get_first_index(node->table)); + + bool purged = true; + + switch (node->rec_type) { + case TRX_UNDO_EMPTY: + break; + case TRX_UNDO_DEL_MARK_REC: + purged = row_purge_del_mark(node); + if (purged) { + if (node->table->stat_initialized + && srv_stats_include_delete_marked) { + dict_stats_update_if_needed( + node->table, *thr->graph->trx); + } + MONITOR_INC(MONITOR_N_DEL_ROW_PURGE); + } + break; + case TRX_UNDO_INSERT_METADATA: + case TRX_UNDO_INSERT_REC: + node->roll_ptr |= 1ULL << ROLL_PTR_INSERT_FLAG_POS; + /* fall through */ + default: + if (!updated_extern) { + mtr_t mtr; + row_purge_reset_trx_id(node, &mtr); + break; + } + /* fall through */ + case TRX_UNDO_UPD_EXIST_REC: + row_purge_upd_exist_or_extern(thr, node, undo_rec); + MONITOR_INC(MONITOR_N_UPD_EXIST_EXTERN); + break; + } + + if (node->found_clust) { + node->found_clust = false; + btr_pcur_close(&node->pcur); + } + + return(purged); +} + +#if defined UNIV_DEBUG || defined WITH_WSREP +# define row_purge_record(node,undo_rec,thr,updated_extern) \ + row_purge_record_func(node,undo_rec,thr,updated_extern) +#else /* UNIV_DEBUG || WITH_WSREP */ +# define row_purge_record(node,undo_rec,thr,updated_extern) \ + row_purge_record_func(node,undo_rec,updated_extern) +#endif /* UNIV_DEBUG || WITH_WSREP */ + +/***********************************************************//** +Fetches an undo log record and does the purge for the recorded operation. +If none left, or the current purge completed, returns the control to the +parent node, which is always a query thread node. */ +static MY_ATTRIBUTE((nonnull)) +void +row_purge( +/*======*/ + purge_node_t* node, /*!< in: row purge node */ + const trx_undo_rec_t* undo_rec, /*!< in: record to purge */ + que_thr_t* thr) /*!< in: query thread */ +{ + if (undo_rec != reinterpret_cast<trx_undo_rec_t*>(-1)) { + bool updated_extern; + + while (row_purge_parse_undo_rec( + node, undo_rec, thr, &updated_extern)) { + + bool purged = row_purge_record( + node, undo_rec, thr, updated_extern); + + if (purged + || srv_shutdown_state > SRV_SHUTDOWN_INITIATED) { + return; + } + + /* Retry the purge in a second. 
*/ + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + } +} + +inline void purge_node_t::start() +{ + ut_ad(in_progress); + DBUG_ASSERT(common.type == QUE_NODE_PURGE); + + row= nullptr; + ref= nullptr; + index= nullptr; + update= nullptr; + found_clust= false; + rec_type= 0; + cmpl_info= 0; +} + +/** Reset the state at end +@return the query graph parent */ +inline que_node_t *purge_node_t::end(THD *thd) +{ + DBUG_ASSERT(common.type == QUE_NODE_PURGE); + ut_ad(undo_recs.empty()); + ut_d(in_progress= false); + innobase_reset_background_thd(thd); +#ifndef DBUG_OFF + static_cast<MDL_context*>(thd_mdl_context(thd))->lock_warrant= nullptr; +#endif + mem_heap_empty(heap); + return common.parent; +} + + +/***********************************************************//** +Does the purge operation. +@return query thread to run next */ +que_thr_t* +row_purge_step( +/*===========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + purge_node_t* node; + + node = static_cast<purge_node_t*>(thr->run_node); + + node->start(); + + while (!node->undo_recs.empty()) { + trx_purge_rec_t purge_rec = node->undo_recs.front(); + node->undo_recs.pop(); + node->roll_ptr = purge_rec.roll_ptr; + + row_purge(node, purge_rec.undo_rec, thr); + } + + thr->run_node = node->end(current_thd); + return(thr); +} + +#ifdef UNIV_DEBUG +/***********************************************************//** +Validate the persisent cursor. The purge node has two references +to the clustered index record - one via the ref member, and the +other via the persistent cursor. These two references must match +each other if the found_clust flag is set. +@return true if the stored copy of persistent cursor is consistent +with the ref member.*/ +bool +purge_node_t::validate_pcur() +{ + if (!found_clust) { + return(true); + } + + if (index == NULL) { + return(true); + } + + if (index->type == DICT_FTS) { + return(true); + } + + if (!pcur.old_rec) { + return(true); + } + + dict_index_t* clust_index = pcur.index(); + + rec_offs* offsets = rec_get_offsets( + pcur.old_rec, clust_index, NULL, pcur.old_n_core_fields, + pcur.old_n_fields, &heap); + + /* Here we are comparing the purge ref record and the stored initial + part in persistent cursor. Both cases we store n_uniq fields of the + cluster index and so it is fine to do the comparison. We note this + dependency here as pcur and ref belong to different modules. */ + int st = cmp_dtuple_rec(ref, pcur.old_rec, clust_index, offsets); + + if (st != 0) { + ib::error() << "Purge node pcur validation failed"; + ib::error() << rec_printer(ref).str(); + ib::error() << rec_printer(pcur.old_rec, offsets).str(); + return(false); + } + + return(true); +} +#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/row/row0quiesce.cc b/storage/innobase/row/row0quiesce.cc new file mode 100644 index 00000000..e927096f --- /dev/null +++ b/storage/innobase/row/row0quiesce.cc @@ -0,0 +1,715 @@ +/***************************************************************************** + +Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0quiesce.cc +Quiesce a tablespace. + +Created 2012-02-08 by Sunny Bains. +*******************************************************/ + +#include "row0quiesce.h" +#include "row0mysql.h" +#include "buf0flu.h" +#include "ibuf0ibuf.h" +#include "srv0start.h" +#include "trx0purge.h" + +#ifdef HAVE_MY_AES_H +#include <my_aes.h> +#endif + +/*********************************************************************//** +Write the meta data (index user fields) config file. +@return DB_SUCCESS or error code. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_index_fields( +/*===========================*/ + const dict_index_t* index, /*!< in: write the meta data for + this index */ + FILE* file, /*!< in: file to write to */ + THD* thd) /*!< in/out: session */ +{ + byte row[sizeof(ib_uint32_t) * 2]; + + for (ulint i = 0; i < index->n_fields; ++i) { + byte* ptr = row; + const dict_field_t* field = &index->fields[i]; + + mach_write_to_4(ptr, field->prefix_len); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, field->fixed_len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_9", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + (ulong) errno, strerror(errno), + "while writing index fields."); + + return(DB_IO_ERROR); + } + + const char* field_name = field->name ? field->name : ""; + /* Include the NUL byte in the length. */ + ib_uint32_t len = static_cast<ib_uint32_t>(strlen(field_name) + 1); + mach_write_to_4(row, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_10", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(len), file) != sizeof(len) + || fwrite(field_name, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + (ulong) errno, strerror(errno), + "while writing index column."); + + return(DB_IO_ERROR); + } + } + + return(DB_SUCCESS); +} + +/*********************************************************************//** +Write the meta data config file index information. +@return DB_SUCCESS or error code. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_indexes( +/*======================*/ + const dict_table_t* table, /*!< in: write the meta data for + this table */ + FILE* file, /*!< in: file to write to */ + THD* thd) /*!< in/out: session */ +{ + ulint n_indexes = 0; + for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index; index = UT_LIST_GET_NEXT(indexes, index)) { + n_indexes += index->is_committed(); + } + + { + byte row[sizeof(ib_uint32_t)]; + + /* Write the number of indexes in the table. */ + mach_write_to_4(row, n_indexes); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_11", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + (ulong) errno, strerror(errno), + "while writing index count."); + + return(DB_IO_ERROR); + } + } + + dberr_t err = DB_SUCCESS; + + /* Write the index meta data. 
*/
+	for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+	     index != 0 && err == DB_SUCCESS;
+	     index = UT_LIST_GET_NEXT(indexes, index)) {
+
+		if (!index->is_committed()) {
+			continue;
+		}
+
+		ut_ad(n_indexes); ut_d(n_indexes--);
+
+		byte*	ptr;
+		byte	row[sizeof(index_id_t)
+			    + sizeof(ib_uint32_t) * 8];
+
+		ptr = row;
+
+		ut_ad(sizeof(index_id_t) == 8);
+		mach_write_to_8(ptr, index->id);
+		ptr += sizeof(index_id_t);
+
+		mach_write_to_4(ptr, table->space_id);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->page);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->type);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->trx_id_offset);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->n_user_defined_cols);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->n_uniq);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->n_nullable);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->n_fields);
+
+		DBUG_EXECUTE_IF("ib_export_io_write_failure_12",
+				close(fileno(file)););
+
+		if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+				(ulong) errno, strerror(errno),
+				"while writing index meta-data.");
+
+			return(DB_IO_ERROR);
+		}
+
+		/* Write the length of the index name.
+		NUL byte is included in the length. */
+		ib_uint32_t	len = static_cast<ib_uint32_t>(strlen(index->name) + 1);
+		ut_a(len > 1);
+
+		mach_write_to_4(row, len);
+
+		DBUG_EXECUTE_IF("ib_export_io_write_failure_1",
+				close(fileno(file)););
+
+		if (fwrite(row, 1, sizeof(len), file) != sizeof(len)
+		    || fwrite(index->name, 1, len, file) != len) {
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+				(ulong) errno, strerror(errno),
+				"while writing index name.");
+
+			return(DB_IO_ERROR);
+		}
+
+		err = row_quiesce_write_index_fields(index, file, thd);
+	}
+
+	ut_ad(!n_indexes);
+	return(err);
+}
+
+/*********************************************************************//**
+Write the meta data (table columns) config file. Serialise the contents of
+dict_col_t structure, along with the column name. All fields are serialized
+as ib_uint32_t.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_table(
+/*====================*/
+	const dict_table_t*	table,	/*!< in: write the meta data for
+					this table */
+	FILE*			file,	/*!< in: file to write to */
+	THD*			thd)	/*!< in/out: session */
+{
+	dict_col_t*	col;
+	byte		row[sizeof(ib_uint32_t) * 7];
+
+	col = table->cols;
+
+	for (ulint i = 0; i < table->n_cols; ++i, ++col) {
+		byte*	ptr = row;
+
+		mach_write_to_4(ptr, col->prtype);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, col->mtype);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, col->len);
+		ptr += sizeof(ib_uint32_t);
+
+		/* FIXME: This will not work if mbminlen>4.
+		This field is also redundant, because the lengths
+		are a property of the character set encoding, which
+		in turn is encoded in prtype above.
*/ + mach_write_to_4(ptr, ulint(col->mbmaxlen * 5 + col->mbminlen)); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->ind); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->ord_part); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->max_prefix); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_2", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + (ulong) errno, strerror(errno), + "while writing table column data."); + + return(DB_IO_ERROR); + } + + /* Write out the column name as [len, byte array]. The len + includes the NUL byte. */ + ib_uint32_t len; + const char* col_name; + + col_name = dict_table_get_col_name(table, dict_col_get_no(col)); + + /* Include the NUL byte in the length. */ + len = static_cast<ib_uint32_t>(strlen(col_name) + 1); + ut_a(len > 1); + + mach_write_to_4(row, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_3", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(len), file) != sizeof(len) + || fwrite(col_name, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + (ulong) errno, strerror(errno), + "while writing column name."); + + return(DB_IO_ERROR); + } + } + + return(DB_SUCCESS); +} + +/*********************************************************************//** +Write the meta data config file header. +@return DB_SUCCESS or error code. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_header( +/*=====================*/ + const dict_table_t* table, /*!< in: write the meta data for + this table */ + FILE* file, /*!< in: file to write to */ + THD* thd) /*!< in/out: session */ +{ + byte value[sizeof(ib_uint32_t)]; + + /* Write the meta-data version number. */ + mach_write_to_4(value, IB_EXPORT_CFG_VERSION_V1); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_4", close(fileno(file));); + + if (fwrite(&value, 1, sizeof(value), file) != sizeof(value)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + (ulong) errno, strerror(errno), + "while writing meta-data version number."); + + return(DB_IO_ERROR); + } + + /* Write the server hostname. */ + ib_uint32_t len; + const char* hostname = server_get_hostname(); + + /* Play it safe and check for NULL. */ + if (hostname == 0) { + static const char NullHostname[] = "Hostname unknown"; + + ib::warn() << "Unable to determine server hostname."; + + hostname = NullHostname; + } + + /* The server hostname includes the NUL byte. */ + len = static_cast<ib_uint32_t>(strlen(hostname) + 1); + mach_write_to_4(value, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_5", close(fileno(file));); + + if (fwrite(&value, 1, sizeof(value), file) != sizeof(value) + || fwrite(hostname, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + (ulong) errno, strerror(errno), + "while writing hostname."); + + return(DB_IO_ERROR); + } + + /* The table name includes the NUL byte. */ + ut_a(table->name.m_name != NULL); + len = static_cast<ib_uint32_t>(strlen(table->name.m_name) + 1); + + /* Write the table name. 
*/
+	mach_write_to_4(value, len);
+
+	DBUG_EXECUTE_IF("ib_export_io_write_failure_6", close(fileno(file)););
+
+	if (fwrite(&value, 1, sizeof(value), file) != sizeof(value)
+	    || fwrite(table->name.m_name, 1, len, file) != len) {
+
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+			(ulong) errno, strerror(errno),
+			"while writing table name.");
+
+		return(DB_IO_ERROR);
+	}
+
+	byte		row[sizeof(ib_uint32_t) * 3];
+
+	/* Write the next autoinc value. */
+	mach_write_to_8(row, table->autoinc);
+
+	DBUG_EXECUTE_IF("ib_export_io_write_failure_7", close(fileno(file)););
+
+	if (fwrite(row, 1, sizeof(ib_uint64_t), file) != sizeof(ib_uint64_t)) {
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+			(ulong) errno, strerror(errno),
+			"while writing table autoinc value.");
+
+		return(DB_IO_ERROR);
+	}
+
+	byte*	ptr = row;
+
+	/* Write the system page size. */
+	mach_write_to_4(ptr, srv_page_size);
+	ptr += sizeof(ib_uint32_t);
+
+	/* Write the table->flags. */
+	mach_write_to_4(ptr, table->flags);
+	ptr += sizeof(ib_uint32_t);
+
+	/* Write the number of columns in the table. */
+	mach_write_to_4(ptr, table->n_cols);
+
+	DBUG_EXECUTE_IF("ib_export_io_write_failure_8", close(fileno(file)););
+
+	if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+			(ulong) errno, strerror(errno),
+			"while writing table meta-data.");
+
+		return(DB_IO_ERROR);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Write the table meta data after quiesce.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_cfg(
+/*==================*/
+	dict_table_t*	table,	/*!< in: write the meta data for
+				this table */
+	THD*		thd)	/*!< in/out: session */
+{
+	dberr_t	err;
+	char	name[OS_FILE_MAX_PATH];
+
+	srv_get_meta_data_filename(table, name, sizeof(name));
+
+	ib::info() << "Writing table metadata to '" << name << "'";
+
+	FILE*	file = fopen(name, "w+b");
+
+	if (file == NULL) {
+		ib_errf(thd, IB_LOG_LEVEL_WARN, ER_CANT_CREATE_FILE,
+			name, errno, strerror(errno));
+
+		err = DB_IO_ERROR;
+	} else {
+		err = row_quiesce_write_header(table, file, thd);
+
+		if (err == DB_SUCCESS) {
+			err = row_quiesce_write_table(table, file, thd);
+		}
+
+		if (err == DB_SUCCESS) {
+			err = row_quiesce_write_indexes(table, file, thd);
+		}
+
+		if (fflush(file) != 0) {
+
+			char	msg[BUFSIZ];
+
+			snprintf(msg, sizeof(msg), "%s flush() failed", name);
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+				(ulong) errno, strerror(errno), msg);
+		}
+
+		if (fclose(file) != 0) {
+			char	msg[BUFSIZ];
+
+			snprintf(msg, sizeof(msg), "%s fclose() failed", name);
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+				(ulong) errno, strerror(errno), msg);
+		}
+	}
+
+	return(err);
+}
+
+/*********************************************************************//**
+Check whether a table has an FTS index defined on it.
+@return true if an FTS index exists on the table */ +static +bool +row_quiesce_table_has_fts_index( +/*============================*/ + const dict_table_t* table) /*!< in: quiesce this table */ +{ + bool exists = false; + + for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + if (index->type & DICT_FTS) { + exists = true; + break; + } + } + + return(exists); +} + +/*********************************************************************//** +Quiesce the tablespace that the table resides in. */ +void +row_quiesce_table_start( +/*====================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ +{ + ut_a(trx->mysql_thd != 0); + ut_a(srv_n_purge_threads > 0); + ut_ad(!srv_read_only_mode); + + ut_a(trx->mysql_thd != 0); + + ut_ad(table->space != NULL); + ib::info() << "Sync to disk of " << table->name << " started."; + + if (srv_undo_sources) { + purge_sys.stop(); + } + + for (ulint count = 0; + ibuf_merge_space(table->space_id); + ++count) { + if (trx_is_interrupted(trx)) { + goto aborted; + } + if (!(count % 20)) { + ib::info() << "Merging change buffer entries for " + << table->name; + } + } + + while (buf_flush_list_space(table->space)) { + if (trx_is_interrupted(trx)) { + goto aborted; + } + } + + if (!trx_is_interrupted(trx)) { + /* Ensure that all asynchronous IO is completed. */ + os_aio_wait_until_no_pending_writes(true); + table->space->flush<false>(); + + if (row_quiesce_write_cfg(table, trx->mysql_thd) + != DB_SUCCESS) { + ib::warn() << "There was an error writing to the" + " meta data file"; + } else { + ib::info() << "Table " << table->name + << " flushed to disk"; + } + } else { +aborted: + ib::warn() << "Quiesce aborted!"; + } + + dberr_t err = row_quiesce_set_state(table, QUIESCE_COMPLETE, trx); + ut_a(err == DB_SUCCESS); +} + +/*********************************************************************//** +Cleanup after table quiesce. */ +void +row_quiesce_table_complete( +/*=======================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ +{ + ulint count = 0; + + ut_a(trx->mysql_thd != 0); + + /* We need to wait for the operation to complete if the + transaction has been killed. */ + + while (table->quiesce != QUIESCE_COMPLETE) { + + /* Print a warning after every minute. */ + if (!(count % 60)) { + ib::warn() << "Waiting for quiesce of " << table->name + << " to complete"; + } + + std::this_thread::sleep_for(std::chrono::seconds(1)); + + ++count; + } + + if (!opt_bootstrap) { + /* Remove the .cfg file now that the user has resumed + normal operations. Otherwise it will cause problems when + the user tries to drop the database (remove directory). */ + char cfg_name[OS_FILE_MAX_PATH]; + + srv_get_meta_data_filename(table, cfg_name, sizeof(cfg_name)); + + os_file_delete_if_exists(innodb_data_file_key, cfg_name, NULL); + + ib::info() << "Deleting the meta-data file '" << cfg_name << "'"; + } + + if (srv_undo_sources) { + purge_sys.resume(); + } + + dberr_t err = row_quiesce_set_state(table, QUIESCE_NONE, trx); + ut_a(err == DB_SUCCESS); +} + +/*********************************************************************//** +Set a table's quiesce state. +@return DB_SUCCESS or error code. 
*/ +dberr_t +row_quiesce_set_state( +/*==================*/ + dict_table_t* table, /*!< in: quiesce this table */ + ib_quiesce_t state, /*!< in: quiesce state to set */ + trx_t* trx) /*!< in/out: transaction */ +{ + ut_a(srv_n_purge_threads > 0); + + if (srv_read_only_mode) { + + ib_senderrf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + + return(DB_UNSUPPORTED); + + } else if (table->is_temporary()) { + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_CANNOT_DISCARD_TEMPORARY_TABLE); + + return(DB_UNSUPPORTED); + } else if (table->space_id == TRX_SYS_SPACE) { + + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), + table->name.m_name); + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_TABLE_IN_SYSTEM_TABLESPACE, table_name); + + return(DB_UNSUPPORTED); + } else if (row_quiesce_table_has_fts_index(table)) { + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_NOT_SUPPORTED_YET, + "FLUSH TABLES on tables that have an FTS index." + " FTS auxiliary tables will not be flushed."); + + } else if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { + /* If this flag is set then the table may not have any active + FTS indexes but it will still have the auxiliary tables. */ + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_NOT_SUPPORTED_YET, + "FLUSH TABLES on a table that had an FTS index," + " created on a hidden column, the" + " auxiliary tables haven't been dropped as yet." + " FTS auxiliary tables will not be flushed."); + } + + dict_index_t* clust_index = dict_table_get_first_index(table); + + for (dict_index_t* index = dict_table_get_next_index(clust_index); + index != NULL; + index = dict_table_get_next_index(index)) { + index->lock.x_lock(SRW_LOCK_CALL); + } + + clust_index->lock.x_lock(SRW_LOCK_CALL); + + switch (state) { + case QUIESCE_START: + break; + + case QUIESCE_COMPLETE: + ut_a(table->quiesce == QUIESCE_START); + break; + + case QUIESCE_NONE: + ut_a(table->quiesce == QUIESCE_COMPLETE); + break; + } + + table->quiesce = state; + + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + index->lock.x_unlock(); + } + + return(DB_SUCCESS); +} + diff --git a/storage/innobase/row/row0row.cc b/storage/innobase/row/row0row.cc new file mode 100644 index 00000000..4a00b2a4 --- /dev/null +++ b/storage/innobase/row/row0row.cc @@ -0,0 +1,1720 @@ +/***************************************************************************** + +Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0row.cc +General row routines + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#include "row0row.h" +#include "data0type.h" +#include "dict0dict.h" +#include "dict0boot.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0ext.h" +#include "row0upd.h" +#include "rem0cmp.h" +#include "ut0mem.h" +#include "gis0geo.h" +#include "row0mysql.h" + +/** Build a spatial index key. +@param[in] index spatial index +@param[in] ext externally stored column prefixes, or NULL +@param[in,out] dfield field of the tuple to be copied +@param[in] dfield2 field of the tuple to copy +@param[in] flag ROW_BUILD_NORMAL, ROW_BUILD_FOR_PURGE or + ROW_BUILD_FOR_UNDO +@param[in,out] heap memory heap from which the memory + of the field entry is allocated. +@retval false if undo log is logged before spatial index creation. */ +static bool row_build_spatial_index_key( + const dict_index_t* index, + const row_ext_t* ext, + dfield_t* dfield, + const dfield_t* dfield2, + ulint flag, + mem_heap_t* heap) +{ + if (dfield2->type.mtype == DATA_MISSING) { + return false; + } + + double* mbr; + + dfield_copy(dfield, dfield2); + dfield->type.prtype |= DATA_GIS_MBR; + + /* Allocate memory for mbr field */ + mbr = static_cast<double*>(mem_heap_alloc(heap, DATA_MBR_LEN)); + + /* Set mbr field data. */ + dfield_set_data(dfield, mbr, DATA_MBR_LEN); + + const fil_space_t* space = index->table->space; + + if (UNIV_UNLIKELY(!dfield2->data || !space)) { + /* FIXME: dfield contains uninitialized data, + but row_build_index_entry_low() will not return NULL. + This bug is inherited from MySQL 5.7.5 + commit b66ad511b61fffe75c58d0a607cdb837c6e6c821. */ + return true; + } + + const byte* dptr = NULL; + ulint dlen = 0; + ulint flen = 0; + double tmp_mbr[SPDIMS * 2]; + mem_heap_t* temp_heap = NULL; + + if (!dfield_is_ext(dfield2)) { + dptr = static_cast<const byte*>(dfield_get_data(dfield2)); + dlen = dfield_get_len(dfield2); + ut_ad(dptr != &data_error); + goto write_mbr; + } + + if (flag == ROW_BUILD_FOR_PURGE) { + const byte* ptr = static_cast<const byte*>( + dfield_get_data(dfield2)); + + switch (dfield_get_spatial_status(dfield2)) { + case SPATIAL_ONLY: + ut_ad(dfield_get_len(dfield2) == DATA_MBR_LEN); + break; + + case SPATIAL_MIXED: + ptr += dfield_get_len(dfield2); + break; + + case SPATIAL_UNKNOWN: + ut_ad(0); + /* fall through */ + case SPATIAL_NONE: + /* Undo record is logged before + spatial index is created.*/ + return false; + } + + memcpy(mbr, ptr, DATA_MBR_LEN); + return true; + } + + if (flag == ROW_BUILD_FOR_UNDO + && dict_table_has_atomic_blobs(index->table)) { + /* For ROW_FORMAT=DYNAMIC or COMPRESSED, a prefix of + off-page records is stored in the undo log record (for + any column prefix indexes). For SPATIAL INDEX, we + must ignore this prefix. The full column value is + stored in the BLOB. For non-spatial index, we would + have already fetched a necessary prefix of the BLOB, + available in the "ext" parameter. 
+ + Here, for SPATIAL INDEX, we are fetching the full + column, which is potentially wasting a lot of I/O, + memory, and possibly involving a concurrency problem, + similar to ones that existed before the introduction + of row_ext_t. + + MDEV-11657 FIXME: write the MBR directly to the undo + log record, and avoid recomputing it here! */ + flen = BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(dfield_get_len(dfield2) >= BTR_EXTERN_FIELD_REF_SIZE); + dptr = static_cast<const byte*>(dfield_get_data(dfield2)) + + dfield_get_len(dfield2) + - BTR_EXTERN_FIELD_REF_SIZE; + } else { + flen = dfield_get_len(dfield2); + dptr = static_cast<const byte*>(dfield_get_data(dfield2)); + } + + temp_heap = mem_heap_create(1000); + + dptr = btr_copy_externally_stored_field( + &dlen, dptr, ext ? ext->zip_size : space->zip_size(), + flen, temp_heap); + +write_mbr: + if (dlen <= GEO_DATA_HEADER_SIZE) { + for (uint i = 0; i < SPDIMS; i += 2) { + tmp_mbr[i] = DBL_MAX; + tmp_mbr[i + 1] = -DBL_MAX; + } + } else { + rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE, + uint(dlen - GEO_DATA_HEADER_SIZE), + SPDIMS, tmp_mbr); + } + + dfield_write_mbr(dfield, tmp_mbr); + if (temp_heap) { + mem_heap_free(temp_heap); + } + + return true; +} + +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. +@return index entry which should be inserted or purged +@retval NULL if the externally stored columns in the clustered index record +are unavailable and ext != NULL, or row is missing some needed columns. */ +dtuple_t* +row_build_index_entry_low( +/*======================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + const dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap, /*!< in,out: memory heap from which + the memory for the index entry + is allocated */ + ulint flag) /*!< in: ROW_BUILD_NORMAL, + ROW_BUILD_FOR_PURGE + or ROW_BUILD_FOR_UNDO */ +{ + dtuple_t* entry; + ulint entry_len; + ulint i = 0; + ulint num_v = 0; + + entry_len = dict_index_get_n_fields(index); + + if (flag == ROW_BUILD_FOR_INSERT && dict_index_is_clust(index)) { + num_v = dict_table_get_n_v_cols(index->table); + entry = dtuple_create_with_vcol(heap, entry_len, num_v); + } else { + entry = dtuple_create(heap, entry_len); + } + + if (dict_index_is_ibuf(index)) { + dtuple_set_n_fields_cmp(entry, entry_len); + /* There may only be externally stored columns + in a clustered index B-tree of a user table. 
*/ + ut_a(!ext); + } else { + dtuple_set_n_fields_cmp( + entry, dict_index_get_n_unique_in_tree(index)); + if (dict_index_is_spatial(index)) { + /* Set the MBR field */ + if (!row_build_spatial_index_key( + index, ext, + dtuple_get_nth_field(entry, 0), + dtuple_get_nth_field( + row, + dict_index_get_nth_field(index, i) + ->col->ind), flag, heap)) { + return NULL; + } + + i = 1; + } + } + + for (; i < entry_len; i++) { + const dict_field_t& f = index->fields[i]; + dfield_t* dfield = dtuple_get_nth_field(entry, i); + + if (f.col->is_dropped()) { + ut_ad(index->is_primary()); + ut_ad(index->is_instant()); + ut_ad(!f.col->is_virtual()); + dict_col_copy_type(f.col, &dfield->type); + if (f.col->is_nullable()) { + dfield_set_null(dfield); + } else { + dfield_set_data(dfield, field_ref_zero, + f.fixed_len); + } + continue; + } + + const dfield_t* dfield2; + + if (f.col->is_virtual()) { + const dict_v_col_t* v_col + = reinterpret_cast<const dict_v_col_t*>(f.col); + + ut_ad(v_col->v_pos < dtuple_get_n_v_fields(row)); + dfield2 = dtuple_get_nth_v_field(row, v_col->v_pos); + + ut_ad(dfield_is_null(dfield2) || + dfield_get_len(dfield2) == 0 || dfield2->data); + ut_ad(!dfield_is_ext(dfield2)); + if (UNIV_UNLIKELY(dfield2->type.mtype + == DATA_MISSING)) { + ut_ad(flag == ROW_BUILD_FOR_PURGE); + return(NULL); + } + } else { + dfield2 = dtuple_get_nth_field(row, f.col->ind); + if (UNIV_UNLIKELY(dfield2->type.mtype + == DATA_MISSING)) { + /* The field has not been initialized in + the row. This should be from + trx_undo_rec_get_partial_row(). */ + return(NULL); + } + + ut_ad(!(dfield2->type.prtype & DATA_VIRTUAL)); + } + + compile_time_assert(DATA_MISSING == 0); + + *dfield = *dfield2; + + if (dfield_is_null(dfield)) { + continue; + } + + ut_ad(!(index->type & DICT_FTS)); + + ulint len = dfield_get_len(dfield); + + if (f.prefix_len == 0 + && (!dfield_is_ext(dfield) + || dict_index_is_clust(index))) { + /* The *dfield = *dfield2 above suffices for + columns that are stored in-page, or for + clustered index record columns that are not + part of a column prefix in the PRIMARY KEY. */ + continue; + } + + /* If the column is stored externally (off-page) in + the clustered index, it must be an ordering field in + the secondary index. If !atomic_blobs, the only way + we may have a secondary index pointing to a clustered + index record with an off-page column is when it is a + column prefix index. If atomic_blobs, also fully + indexed long columns may be stored off-page. */ + ut_ad(f.col->ord_part); + + if (ext && !f.col->is_virtual()) { + /* See if the column is stored externally. */ + const byte* buf = row_ext_lookup(ext, f.col->ind, + &len); + if (UNIV_LIKELY_NULL(buf)) { + if (UNIV_UNLIKELY(buf == field_ref_zero)) { + return(NULL); + } + dfield_set_data(dfield, buf, len); + } + + if (f.prefix_len == 0) { + /* If ROW_FORMAT=DYNAMIC or + ROW_FORMAT=COMPRESSED, we can have a + secondary index on an entire column + that is stored off-page in the + clustered index. As this is not a + prefix index (prefix_len == 0), + include the entire off-page column in + the secondary index record. */ + continue; + } + } else if (dfield_is_ext(dfield)) { + /* This table is either in + (ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT) + or a purge record where the ordered part of + the field is not external. + In ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, + the maximum column prefix + index length is 767 bytes, and the clustered + index record contains a 768-byte prefix of + each off-page column. 
*/ + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + len -= BTR_EXTERN_FIELD_REF_SIZE; + dfield_set_len(dfield, len); + } + + /* If a column prefix index, take only the prefix. */ + if (f.prefix_len) { + len = dtype_get_at_most_n_mbchars( + f.col->prtype, + f.col->mbminlen, f.col->mbmaxlen, + f.prefix_len, len, + static_cast<char*>(dfield_get_data(dfield))); + dfield_set_len(dfield, len); + } + } + + for (i = num_v; i--; ) { + ut_ad(index->is_primary()); + ut_ad(flag == ROW_BUILD_FOR_INSERT); + dfield_t* dfield = dtuple_get_nth_v_field(entry, i); + const dict_v_col_t* v_col = dict_table_get_nth_v_col( + index->table, i); + ut_ad(!v_col->m_col.is_dropped()); + ut_ad(v_col->v_pos < dtuple_get_n_v_fields(row)); + const dfield_t* dfield2 = dtuple_get_nth_v_field( + row, v_col->v_pos); + ut_ad(dfield_is_null(dfield2) || + dfield_get_len(dfield2) == 0 || dfield2->data); + ut_ad(dfield2->type.mtype != DATA_MISSING); + *dfield = *dfield2; + } + + return entry; +} + +/** An inverse function to row_build_index_entry. Builds a row from a +record in a clustered index, with possible indexing on ongoing +addition of new virtual columns. +@param[in] type ROW_COPY_POINTERS or ROW_COPY_DATA; +@param[in] index clustered index +@param[in] rec record in the clustered index +@param[in] offsets rec_get_offsets(rec,index) or NULL +@param[in] col_table table, to check which + externally stored columns + occur in the ordering columns + of an index, or NULL if + index->table should be + consulted instead +@param[in] defaults default values of added/changed columns, or NULL +@param[in] add_v new virtual columns added + along with new indexes +@param[in] col_map mapping of old column + numbers to new ones, or NULL +@param[in] ext cache of externally stored column + prefixes, or NULL +@param[in] heap memory heap from which + the memory needed is allocated +@return own: row built; */ +static inline +dtuple_t* +row_build_low( + ulint type, + const dict_index_t* index, + const rec_t* rec, + const rec_offs* offsets, + const dict_table_t* col_table, + const dtuple_t* defaults, + const dict_add_v_col_t* add_v, + const ulint* col_map, + row_ext_t** ext, + mem_heap_t* heap) +{ + const byte* copy; + dtuple_t* row; + ulint n_ext_cols; + ulint* ext_cols = NULL; /* remove warning */ + ulint len; + byte* buf; + ulint j; + mem_heap_t* tmp_heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + ut_ad(index != NULL); + ut_ad(rec != NULL); + ut_ad(heap != NULL); + ut_ad(dict_index_is_clust(index)); + ut_ad(!col_map || col_table); + + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + ULINT_UNDEFINED, &tmp_heap); + } else { + ut_ad(rec_offs_validate(rec, index, offsets)); + } + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* Some blob refs can be NULL during crash recovery before + trx_rollback_active() has completed execution, or when a concurrently + executing insert or update has committed the B-tree mini-transaction + but has not yet managed to restore the cursor position for writing + the big_rec. Note that the mini-transaction can be committed multiple + times, and the cursor restore can happen multiple times for single + insert or update statement. 
*/ + ut_a(!rec_offs_any_null_extern(rec, offsets) + || trx_sys.is_registered(current_trx(), + row_get_rec_trx_id(rec, index, + offsets))); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + if (type != ROW_COPY_POINTERS) { + /* Take a copy of rec to heap */ + buf = static_cast<byte*>( + mem_heap_alloc(heap, rec_offs_size(offsets))); + + copy = rec_copy(buf, rec, offsets); + } else { + copy = rec; + } + + n_ext_cols = rec_offs_n_extern(offsets); + if (n_ext_cols) { + ext_cols = static_cast<ulint*>( + mem_heap_alloc(heap, n_ext_cols * sizeof *ext_cols)); + } + + /* Avoid a debug assertion in rec_offs_validate(). */ + rec_offs_make_valid(copy, index, true, const_cast<rec_offs*>(offsets)); + + if (!col_table) { + ut_ad(!col_map); + ut_ad(!defaults); + col_table = index->table; + } + + if (defaults) { + ut_ad(col_map); + row = dtuple_copy(defaults, heap); + /* dict_table_copy_types() would set the fields to NULL */ + for (ulint i = 0; i < dict_table_get_n_cols(col_table); i++) { + dict_col_copy_type( + dict_table_get_nth_col(col_table, i), + dfield_get_type(dtuple_get_nth_field(row, i))); + } + } else if (add_v != NULL) { + row = dtuple_create_with_vcol( + heap, dict_table_get_n_cols(col_table), + dict_table_get_n_v_cols(col_table) + add_v->n_v_col); + dict_table_copy_types(row, col_table); + + for (ulint i = 0; i < add_v->n_v_col; i++) { + dict_col_copy_type( + &add_v->v_col[i].m_col, + dfield_get_type(dtuple_get_nth_v_field( + row, i + col_table->n_v_def))); + } + } else { + row = dtuple_create_with_vcol( + heap, dict_table_get_n_cols(col_table), + dict_table_get_n_v_cols(col_table)); + dict_table_copy_types(row, col_table); + } + + dtuple_set_info_bits(row, rec_get_info_bits( + copy, rec_offs_comp(offsets))); + + j = 0; + + const dict_field_t* ind_field = index->fields; + + for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) { + if (i == index->first_user_field() + && rec_is_alter_metadata(rec, *index)) { + ut_ad(rec_offs_nth_extern(offsets, i)); + ut_d(ulint len); + ut_d(rec_get_nth_field_offs(offsets, i, &len)); + ut_ad(len == FIELD_REF_SIZE); + continue; + } + + if (UNIV_UNLIKELY(ind_field + >= &index->fields[index->n_fields])) { + ut_ad(rec_is_metadata(rec, *index)); + continue; + } + + const dict_col_t* col = dict_field_get_col(ind_field); + + if ((ind_field++)->prefix_len) { + /* Column prefixes can only occur in key + fields, which cannot be stored externally. For + a column prefix, there should also be the full + field in the clustered index tuple. The row + tuple comprises full fields, not prefixes. */ + ut_ad(!rec_offs_nth_extern(offsets, i)); + continue; + } + + if (col->is_dropped()) { + continue; + } + + ulint col_no = dict_col_get_no(col); + + if (col_map) { + col_no = col_map[col_no]; + + if (col_no == ULINT_UNDEFINED) { + /* dropped column */ + continue; + } + } + + dfield_t* dfield = dtuple_get_nth_field(row, col_no); + + const void* field = rec_get_nth_field( + copy, offsets, i, &len); + if (len == UNIV_SQL_DEFAULT) { + field = index->instant_field_value(i, &len); + if (field && type != ROW_COPY_POINTERS) { + field = mem_heap_dup(heap, field, len); + } + } + dfield_set_data(dfield, field, len); + + if (rec_offs_nth_extern(offsets, i)) { + dfield_set_ext(dfield); + + col = dict_table_get_nth_col(col_table, col_no); + + if (col->ord_part) { + /* We will have to fetch prefixes of + externally stored columns that are + referenced by column prefixes. 
*/ + ext_cols[j++] = col_no; + } + } + } + + rec_offs_make_valid(rec, index, true, const_cast<rec_offs*>(offsets)); + + ut_ad(dtuple_check_typed(row)); + + if (!ext) { + /* REDUNDANT and COMPACT formats store a local + 768-byte prefix of each externally stored + column. No cache is needed. + + During online table rebuild, + row_log_table_apply_delete_low() + may use a cache that was set up by + row_log_table_delete(). */ + + } else if (j) { + *ext = row_ext_create(j, ext_cols, *index->table, row, + heap); + } else { + *ext = NULL; + } + + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + return(row); +} + + +/*******************************************************************//** +An inverse function to row_build_index_entry. Builds a row from a +record in a clustered index. +@return own: row built; see the NOTE below! */ +dtuple_t* +row_build( +/*======*/ + ulint type, /*!< in: ROW_COPY_POINTERS or + ROW_COPY_DATA; the latter + copies also the data fields to + heap while the first only + places pointers to data fields + on the index page, and thus is + more efficient */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_t* rec, /*!< in: record in the clustered + index; NOTE: in the case + ROW_COPY_POINTERS the data + fields in the row will point + directly into this record, + therefore, the buffer page of + this record must be at least + s-latched and the latch held + as long as the row dtuple is used! */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) + or NULL, in which case this function + will invoke rec_get_offsets() */ + const dict_table_t* col_table, + /*!< in: table, to check which + externally stored columns + occur in the ordering columns + of an index, or NULL if + index->table should be + consulted instead */ + const dtuple_t* defaults, + /*!< in: default values of + added and changed columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL */ + row_ext_t** ext, /*!< out, own: cache of + externally stored column + prefixes, or NULL */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ +{ + return(row_build_low(type, index, rec, offsets, col_table, + defaults, NULL, col_map, ext, heap)); +} + +/** An inverse function to row_build_index_entry. Builds a row from a +record in a clustered index, with possible indexing on ongoing +addition of new virtual columns. 
+@param[in] type ROW_COPY_POINTERS or ROW_COPY_DATA; +@param[in] index clustered index +@param[in] rec record in the clustered index +@param[in] offsets rec_get_offsets(rec,index) or NULL +@param[in] col_table table, to check which + externally stored columns + occur in the ordering columns + of an index, or NULL if + index->table should be + consulted instead +@param[in] defaults default values of added, changed columns, or NULL +@param[in] add_v new virtual columns added + along with new indexes +@param[in] col_map mapping of old column + numbers to new ones, or NULL +@param[in] ext cache of externally stored column + prefixes, or NULL +@param[in] heap memory heap from which + the memory needed is allocated +@return own: row built; */ +dtuple_t* +row_build_w_add_vcol( + ulint type, + const dict_index_t* index, + const rec_t* rec, + const rec_offs* offsets, + const dict_table_t* col_table, + const dtuple_t* defaults, + const dict_add_v_col_t* add_v, + const ulint* col_map, + row_ext_t** ext, + mem_heap_t* heap) +{ + return(row_build_low(type, index, rec, offsets, col_table, + defaults, add_v, col_map, ext, heap)); +} + +/** Convert an index record to a data tuple. +@tparam metadata whether the index->instant_field_value() needs to be accessed +@tparam mblob 1 if rec_is_alter_metadata(); +2 if we want converted metadata corresponding to info_bits +@param[in] rec index record +@param[in] index index +@param[in] offsets rec_get_offsets(rec, index) +@param[out] n_ext number of externally stored columns +@param[in,out] heap memory heap for allocations +@param[in] info_bits (only used if mblob=2) +@param[in] pad (only used if mblob=2) +@return index entry built; does not set info_bits, and the data fields +in the entry will point directly to rec */ +template<bool metadata, int mblob = 0> +static inline +dtuple_t* +row_rec_to_index_entry_impl( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + mem_heap_t* heap, + ulint info_bits = 0, + bool pad = false) +{ + ut_ad(rec != NULL); + ut_ad(heap != NULL); + ut_ad(index != NULL); + ut_ad(!mblob || index->is_primary()); + ut_ad(!mblob || !index->table->is_temporary()); + ut_ad(!mblob || !dict_index_is_spatial(index)); + compile_time_assert(!mblob || metadata); + compile_time_assert(mblob <= 2); + /* Because this function may be invoked by row0merge.cc + on a record whose header is in different format, the check + rec_offs_validate(rec, index, offsets) must be avoided here. */ + + const bool got = mblob == 2 && rec_is_alter_metadata(rec, *index); + ulint rec_len = rec_offs_n_fields(offsets); + if (mblob == 2) { + ut_ad(info_bits == REC_INFO_METADATA_ALTER + || info_bits == REC_INFO_METADATA_ADD); + if (pad) { + ut_ad(rec_len <= ulint(index->n_fields + got)); + rec_len = ulint(index->n_fields) + + (info_bits == REC_INFO_METADATA_ALTER); + } else if (got) { + rec_len = std::min(rec_len, + ulint(index->n_fields + got)); + } else if (info_bits == REC_INFO_METADATA_ALTER) { + ut_ad(rec_len <= index->n_fields); + rec_len++; + } + } else { + ut_ad(info_bits == 0); + ut_ad(!pad); + } + dtuple_t* entry = dtuple_create(heap, rec_len); + dfield_t* dfield = entry->fields; + + dtuple_set_n_fields_cmp(entry, + dict_index_get_n_unique_in_tree(index)); + ut_ad(mblob == 2 + || rec_len == dict_index_get_n_fields(index) + uint(mblob == 1) + /* a record for older SYS_INDEXES table + (missing merge_threshold column) is acceptable. 
*/ + || (!index->table->is_temporary() + && index->table->id == DICT_INDEXES_ID + && rec_len + 1 == dict_index_get_n_fields(index))); + + ulint i; + for (i = 0; i < (mblob ? index->first_user_field() : rec_len); + i++, dfield++) { + dict_col_copy_type(dict_index_get_nth_col(index, i), + &dfield->type); + if (!mblob + && dict_index_is_spatial(index) + && DATA_GEOMETRY_MTYPE(dfield->type.mtype)) { + dfield->type.prtype |= DATA_GIS_MBR; + } + + ulint len; + const byte* field = metadata + ? rec_get_nth_cfield(rec, index, offsets, i, &len) + : rec_get_nth_field(rec, offsets, i, &len); + + dfield_set_data(dfield, field, len); + + if (rec_offs_nth_extern(offsets, i)) { + dfield_set_ext(dfield); + } + } + + if (mblob) { + ulint len; + const byte* field; + ulint j = i; + + if (mblob == 2) { + const bool want = info_bits == REC_INFO_METADATA_ALTER; + if (got == want) { + if (got) { + goto copy_metadata; + } + } else { + if (want) { + /* Allocate a placeholder for + adding metadata in an update. */ + len = FIELD_REF_SIZE; + field = static_cast<byte*>( + mem_heap_zalloc(heap, len)); + /* In reality there is one fewer + field present in the record. */ + rec_len--; + goto init_metadata; + } + + /* Skip the undesired metadata blob + (for example, when rolling back an + instant ALTER TABLE). */ + i++; + } + goto copy_user_fields; + } +copy_metadata: + ut_ad(rec_offs_nth_extern(offsets, i)); + field = rec_get_nth_field(rec, offsets, i++, &len); +init_metadata: + dfield->type.metadata_blob_init(); + ut_ad(len == FIELD_REF_SIZE); + dfield_set_data(dfield, field, len); + dfield_set_ext(dfield++); +copy_user_fields: + for (; i < rec_len; i++, dfield++) { + dict_col_copy_type(dict_index_get_nth_col(index, j++), + &dfield->type); + if (mblob == 2 && pad + && i >= rec_offs_n_fields(offsets)) { + field = index->instant_field_value(j - 1, + &len); + dfield_set_data(dfield, field, len); + continue; + } + + field = rec_get_nth_field(rec, offsets, i, &len); + dfield_set_data(dfield, field, len); + + if (rec_offs_nth_extern(offsets, i)) { + dfield_set_ext(dfield); + } + } + } + + if (mblob == 2) { + ulint n_fields = ulint(dfield - entry->fields); + ut_ad(entry->n_fields >= n_fields); + entry->n_fields = n_fields; + } + ut_ad(dfield == entry->fields + entry->n_fields); + ut_ad(dtuple_check_typed(entry)); + return entry; +} + +/** Convert an index record to a data tuple. +@param[in] rec index record +@param[in] index index +@param[in] offsets rec_get_offsets(rec, index) +@param[in,out] heap memory heap for allocations */ +dtuple_t* +row_rec_to_index_entry_low( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + mem_heap_t* heap) +{ + return row_rec_to_index_entry_impl<false>(rec, index, offsets, heap); +} + +/*******************************************************************//** +Converts an index record to a typed data tuple. NOTE that externally +stored (often big) fields are NOT copied to heap. 
+@return own: index entry built */ +dtuple_t* +row_rec_to_index_entry( +/*===================*/ + const rec_t* rec, /*!< in: record in the index */ + const dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec) */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ +{ + ut_ad(rec != NULL); + ut_ad(heap != NULL); + ut_ad(index != NULL); + ut_ad(rec_offs_validate(rec, index, offsets)); + + /* Take a copy of rec to heap */ + const rec_t* copy_rec = rec_copy( + static_cast<byte*>(mem_heap_alloc(heap, + rec_offs_size(offsets))), + rec, offsets); + + rec_offs_make_valid(copy_rec, index, true, + const_cast<rec_offs*>(offsets)); + + dtuple_t* entry = rec_is_alter_metadata(copy_rec, *index) + ? row_rec_to_index_entry_impl<true,1>( + copy_rec, index, offsets, heap) + : row_rec_to_index_entry_impl<true>( + copy_rec, index, offsets, heap); + + rec_offs_make_valid(rec, index, true, + const_cast<rec_offs*>(offsets)); + + dtuple_set_info_bits(entry, + rec_get_info_bits(rec, rec_offs_comp(offsets))); + + return(entry); +} + +/** Convert a metadata record to a data tuple. +@param[in] rec metadata record +@param[in] index clustered index after instant ALTER TABLE +@param[in] offsets rec_get_offsets(rec) +@param[in,out] heap memory heap for allocations +@param[in] info_bits the info_bits after an update +@param[in] pad whether to pad to index->n_fields */ +dtuple_t* +row_metadata_to_tuple( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + mem_heap_t* heap, + ulint info_bits, + bool pad) +{ + ut_ad(info_bits == REC_INFO_METADATA_ALTER + || info_bits == REC_INFO_METADATA_ADD); + ut_ad(rec_is_metadata(rec, *index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + const rec_t* copy_rec = rec_copy( + static_cast<byte*>(mem_heap_alloc(heap, + rec_offs_size(offsets))), + rec, offsets); + + rec_offs_make_valid(copy_rec, index, true, + const_cast<rec_offs*>(offsets)); + + dtuple_t* entry = info_bits == REC_INFO_METADATA_ALTER + || rec_is_alter_metadata(copy_rec, *index) + ? row_rec_to_index_entry_impl<true,2>( + copy_rec, index, offsets, heap, info_bits, pad) + : row_rec_to_index_entry_impl<true>( + copy_rec, index, offsets, heap); + + rec_offs_make_valid(rec, index, true, + const_cast<rec_offs*>(offsets)); + + dtuple_set_info_bits(entry, info_bits); + return entry; +} + +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. +@return own: row reference built; see the NOTE below! */ +dtuple_t* +row_build_row_ref( +/*==============*/ + ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS: + the former copies also the data fields to + heap, whereas the latter only places pointers + to data fields on the index page */ + dict_index_t* index, /*!< in: secondary index */ + const rec_t* rec, /*!< in: record in the index; + NOTE: in the case ROW_COPY_POINTERS + the data fields in the row will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the row reference is used! 
*/ + mem_heap_t* heap) /*!< in: memory heap from which the memory + needed is allocated */ +{ + dict_table_t* table; + dict_index_t* clust_index; + dfield_t* dfield; + dtuple_t* ref; + const byte* field; + ulint len; + ulint ref_len; + ulint pos; + byte* buf; + ulint clust_col_prefix_len; + ulint i; + mem_heap_t* tmp_heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(index != NULL); + ut_ad(rec != NULL); + ut_ad(heap != NULL); + ut_ad(!dict_index_is_clust(index)); + + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &tmp_heap); + /* Secondary indexes must not contain externally stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + if (type == ROW_COPY_DATA) { + /* Take a copy of rec to heap */ + + buf = static_cast<byte*>( + mem_heap_alloc(heap, rec_offs_size(offsets))); + + rec = rec_copy(buf, rec, offsets); + rec_offs_make_valid(rec, index, true, offsets); + } + + table = index->table; + + clust_index = dict_table_get_first_index(table); + + ref_len = dict_index_get_n_unique(clust_index); + + ref = dtuple_create(heap, ref_len); + + dict_index_copy_types(ref, clust_index, ref_len); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + pos = dict_index_get_nth_field_pos(index, clust_index, i); + + ut_a(pos != ULINT_UNDEFINED); + + ut_ad(!rec_offs_nth_default(offsets, pos)); + field = rec_get_nth_field(rec, offsets, pos, &len); + + dfield_set_data(dfield, field, len); + + /* If the primary key contains a column prefix, then the + secondary index may contain a longer prefix of the same + column, or the full column, and we must adjust the length + accordingly. */ + + clust_col_prefix_len = dict_index_get_nth_field( + clust_index, i)->prefix_len; + + if (clust_col_prefix_len > 0) { + if (len != UNIV_SQL_NULL) { + + const dtype_t* dtype + = dfield_get_type(dfield); + + dfield_set_len(dfield, + dtype_get_at_most_n_mbchars( + dtype->prtype, + dtype->mbminlen, + dtype->mbmaxlen, + clust_col_prefix_len, + len, (char*) field)); + } + } + } + + ut_ad(dtuple_check_typed(ref)); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + return(ref); +} + +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +void +row_build_row_ref_in_tuple( +/*=======================*/ + dtuple_t* ref, /*!< in/out: row reference built; + see the NOTE below! */ + const rec_t* rec, /*!< in: record in the index; + NOTE: the data fields in ref + will point directly into this + record, therefore, the buffer + page of this record must be at + least s-latched and the latch + held as long as the row + reference is used! 
*/ + const dict_index_t* index, /*!< in: secondary index */ + rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) + or NULL */ +{ + const dict_index_t* clust_index; + dfield_t* dfield; + const byte* field; + ulint len; + ulint ref_len; + ulint pos; + ulint clust_col_prefix_len; + ulint i; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + ut_ad(!dict_index_is_clust(index)); + ut_a(index->table); + + clust_index = dict_table_get_first_index(index->table); + ut_ad(clust_index); + + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + } else { + ut_ad(rec_offs_validate(rec, index, offsets)); + } + + /* Secondary indexes must not contain externally stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + ref_len = dict_index_get_n_unique(clust_index); + + ut_ad(ref_len == dtuple_get_n_fields(ref)); + + dict_index_copy_types(ref, clust_index, ref_len); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + pos = dict_index_get_nth_field_pos(index, clust_index, i); + + ut_a(pos != ULINT_UNDEFINED); + + ut_ad(!rec_offs_nth_default(offsets, pos)); + field = rec_get_nth_field(rec, offsets, pos, &len); + + dfield_set_data(dfield, field, len); + + /* If the primary key contains a column prefix, then the + secondary index may contain a longer prefix of the same + column, or the full column, and we must adjust the length + accordingly. */ + + clust_col_prefix_len = dict_index_get_nth_field( + clust_index, i)->prefix_len; + + if (clust_col_prefix_len > 0) { + if (len != UNIV_SQL_NULL) { + + const dtype_t* dtype + = dfield_get_type(dfield); + + dfield_set_len(dfield, + dtype_get_at_most_n_mbchars( + dtype->prtype, + dtype->mbminlen, + dtype->mbmaxlen, + clust_col_prefix_len, + len, (char*) field)); + } + } + } + + ut_ad(dtuple_check_typed(ref)); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/***************************************************************//** +Searches the clustered index record for a row, if we have the row reference. +@return TRUE if found */ +bool +row_search_on_row_ref( +/*==================*/ + btr_pcur_t* pcur, /*!< out: persistent cursor, which must + be closed by the caller */ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */ + const dict_table_t* table, /*!< in: table */ + const dtuple_t* ref, /*!< in: row reference */ + mtr_t* mtr) /*!< in/out: mtr */ +{ + ut_ad(dtuple_check_typed(ref)); + + dict_index_t *index = dict_table_get_first_index(table); + btr_pcur_init(pcur); + pcur->btr_cur.page_cur.index = index; + + if (UNIV_UNLIKELY(ref->info_bits != 0)) { + ut_ad(ref->is_metadata()); + ut_ad(ref->n_fields <= index->n_uniq); + if (pcur->open_leaf(true, index, mode, mtr) != DB_SUCCESS + || !btr_pcur_move_to_next_user_rec(pcur, mtr)) { + return false; + } + /* We do not necessarily have index->is_instant() here, + because we could be executing a rollback of an + instant ADD COLUMN operation. The function + rec_is_metadata() asserts index->is_instant(); + we do not want to call it here. 
*/ + return rec_get_info_bits(btr_pcur_get_rec(pcur), + dict_table_is_comp(index->table)) + & REC_INFO_MIN_REC_FLAG; + } else { + ut_a(ref->n_fields == index->n_uniq); + if (btr_pcur_open(ref, PAGE_CUR_LE, mode, pcur, mtr) + != DB_SUCCESS) { + return false; + } + } + + return !page_rec_is_infimum(btr_pcur_get_rec(pcur)) + && btr_pcur_get_low_match(pcur) == dtuple_get_n_fields(ref); +} + +/*********************************************************************//** +Fetches the clustered index record for a secondary index record. The latches +on the secondary index record are preserved. +@return record or NULL, if no record found */ +rec_t* +row_get_clust_rec( +/*==============*/ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */ + const rec_t* rec, /*!< in: record in a secondary index */ + dict_index_t* index, /*!< in: secondary index */ + dict_index_t** clust_index,/*!< out: clustered index */ + mtr_t* mtr) /*!< in: mtr */ +{ + mem_heap_t* heap; + dtuple_t* ref; + dict_table_t* table; + btr_pcur_t pcur; + + ut_ad(!dict_index_is_clust(index)); + + table = index->table; + + heap = mem_heap_create(256); + + ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, heap); + + auto found = row_search_on_row_ref(&pcur, mode, table, ref, mtr); + + mem_heap_free(heap); + + *clust_index = dict_table_get_first_index(table); + return found ? btr_pcur_get_rec(&pcur) : nullptr; +} + +/***************************************************************//** +Searches an index record. +@return whether the record was found or buffered */ +enum row_search_result +row_search_index_entry( +/*===================*/ + const dtuple_t* entry, /*!< in: index entry */ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */ + btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must + be closed by the caller */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint n_fields; + ulint low_match; + rec_t* rec; + + ut_ad(dtuple_check_typed(entry)); + + if (btr_pcur_open(entry, PAGE_CUR_LE, mode, pcur, mtr) != DB_SUCCESS) { + return ROW_NOT_FOUND; + } + + switch (btr_pcur_get_btr_cur(pcur)->flag) { + case BTR_CUR_DELETE_REF: + ut_ad(!(~mode & BTR_DELETE)); + return(ROW_NOT_DELETED_REF); + + case BTR_CUR_DEL_MARK_IBUF: + case BTR_CUR_DELETE_IBUF: + case BTR_CUR_INSERT_TO_IBUF: + return(ROW_BUFFERED); + + case BTR_CUR_HASH: + case BTR_CUR_HASH_FAIL: + case BTR_CUR_BINARY: + break; + } + + low_match = btr_pcur_get_low_match(pcur); + + rec = btr_pcur_get_rec(pcur); + + n_fields = dtuple_get_n_fields(entry); + + if (page_rec_is_infimum(rec)) { + + return(ROW_NOT_FOUND); + } else if (low_match != n_fields) { + + return(ROW_NOT_FOUND); + } + + return(ROW_FOUND); +} + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_INT using "prtype" and writes the result to "buf". +If the data is in unknown format, then nothing is written to "buf", +0 is returned and "format_in_hex" is set to TRUE, otherwise +"format_in_hex" is left untouched. +Not more than "buf_size" bytes are written to "buf". +The result is always '\0'-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating '\0'). 
+@return number of bytes that were written */ +static +ulint +row_raw_format_int( +/*===============*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + ulint prtype, /*!< in: precise type */ + char* buf, /*!< out: output buffer */ + ulint buf_size, /*!< in: output buffer size + in bytes */ + ibool* format_in_hex) /*!< out: should the data be + formatted in hex */ +{ + ulint ret; + + if (data_len <= sizeof(ib_uint64_t)) { + + ib_uint64_t value; + ibool unsigned_type = prtype & DATA_UNSIGNED; + + value = mach_read_int_type( + (const byte*) data, data_len, unsigned_type); + + ret = (ulint) snprintf( + buf, buf_size, + unsigned_type ? "%llu" : "%lld", (longlong) value)+1; + } else { + + *format_in_hex = TRUE; + ret = 0; + } + + return(ut_min(ret, buf_size)); +} + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "prtype" and writes the +result to "buf". +If the data is in binary format, then nothing is written to "buf", +0 is returned and "format_in_hex" is set to TRUE, otherwise +"format_in_hex" is left untouched. +Not more than "buf_size" bytes are written to "buf". +The result is always '\0'-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating '\0'). +@return number of bytes that were written */ +static +ulint +row_raw_format_str( +/*===============*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + ulint prtype, /*!< in: precise type */ + char* buf, /*!< out: output buffer */ + ulint buf_size, /*!< in: output buffer size + in bytes */ + ibool* format_in_hex) /*!< out: should the data be + formatted in hex */ +{ + ulint charset_coll; + + if (buf_size == 0) { + + return(0); + } + + /* we assume system_charset_info is UTF-8 */ + + charset_coll = dtype_get_charset_coll(prtype); + + if (UNIV_LIKELY(dtype_is_utf8(prtype))) { + + return(ut_str_sql_format(data, data_len, buf, buf_size)); + } + /* else */ + + if (charset_coll == DATA_MYSQL_BINARY_CHARSET_COLL) { + + *format_in_hex = TRUE; + return(0); + } + /* else */ + + return(innobase_raw_format(data, data_len, charset_coll, + buf, buf_size)); +} + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) using +"dict_field" and writes the result to "buf". +Not more than "buf_size" bytes are written to "buf". +The result is always NUL-terminated (provided buf_size is positive) and the +number of bytes that were written to "buf" is returned (including the +terminating NUL). 
+@return number of bytes that were written */ +ulint +row_raw_format( +/*===========*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + const dict_field_t* dict_field, /*!< in: index field */ + char* buf, /*!< out: output buffer */ + ulint buf_size) /*!< in: output buffer size + in bytes */ +{ + ulint mtype; + ulint prtype; + ulint ret; + ibool format_in_hex; + + ut_ad(data_len != UNIV_SQL_DEFAULT); + + if (buf_size == 0) { + + return(0); + } + + if (data_len == UNIV_SQL_NULL) { + + ret = snprintf((char*) buf, buf_size, "NULL") + 1; + + return(ut_min(ret, buf_size)); + } + + mtype = dict_field->col->mtype; + prtype = dict_field->col->prtype; + + format_in_hex = FALSE; + + switch (mtype) { + case DATA_INT: + + ret = row_raw_format_int(data, data_len, prtype, + buf, buf_size, &format_in_hex); + if (format_in_hex) { + + goto format_in_hex; + } + break; + case DATA_CHAR: + case DATA_VARCHAR: + case DATA_MYSQL: + case DATA_VARMYSQL: + + ret = row_raw_format_str(data, data_len, prtype, + buf, buf_size, &format_in_hex); + if (format_in_hex) { + + goto format_in_hex; + } + + break; + /* XXX support more data types */ + default: + format_in_hex: + + if (UNIV_LIKELY(buf_size > 2)) { + + memcpy(buf, "0x", 2); + buf += 2; + buf_size -= 2; + ret = 2 + ut_raw_to_hex(data, data_len, + buf, buf_size); + } else { + + buf[0] = '\0'; + ret = 1; + } + } + + return(ret); +} + +#ifdef UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT + +#ifdef HAVE_UT_CHRONO_T + +void +test_row_raw_format_int() +{ + ulint ret; + char buf[128]; + ibool format_in_hex; + ulint i; + +#define CALL_AND_TEST(data, data_len, prtype, buf, buf_size,\ + ret_expected, buf_expected, format_in_hex_expected)\ + do {\ + ibool ok = TRUE;\ + ulint i;\ + memset(buf, 'x', 10);\ + buf[10] = '\0';\ + format_in_hex = FALSE;\ + fprintf(stderr, "TESTING \"\\x");\ + for (i = 0; i < data_len; i++) {\ + fprintf(stderr, "%02hhX", data[i]);\ + }\ + fprintf(stderr, "\", %lu, %lu, %lu\n",\ + (ulint) data_len, (ulint) prtype,\ + (ulint) buf_size);\ + ret = row_raw_format_int(data, data_len, prtype,\ + buf, buf_size, &format_in_hex);\ + if (ret != ret_expected) {\ + fprintf(stderr, "expected ret %lu, got %lu\n",\ + (ulint) ret_expected, ret);\ + ok = FALSE;\ + }\ + if (strcmp((char*) buf, buf_expected) != 0) {\ + fprintf(stderr, "expected buf \"%s\", got \"%s\"\n",\ + buf_expected, buf);\ + ok = FALSE;\ + }\ + if (format_in_hex != format_in_hex_expected) {\ + fprintf(stderr, "expected format_in_hex %d, got %d\n",\ + (int) format_in_hex_expected,\ + (int) format_in_hex);\ + ok = FALSE;\ + }\ + if (ok) {\ + fprintf(stderr, "OK: %lu, \"%s\" %d\n\n",\ + (ulint) ret, buf, (int) format_in_hex);\ + } else {\ + return;\ + }\ + } while (0) + +#if 1 + /* min values for signed 1-8 byte integers */ + + CALL_AND_TEST("\x00", 1, 0, + buf, sizeof(buf), 5, "-128", 0); + + CALL_AND_TEST("\x00\x00", 2, 0, + buf, sizeof(buf), 7, "-32768", 0); + + CALL_AND_TEST("\x00\x00\x00", 3, 0, + buf, sizeof(buf), 9, "-8388608", 0); + + CALL_AND_TEST("\x00\x00\x00\x00", 4, 0, + buf, sizeof(buf), 12, "-2147483648", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, 0, + buf, sizeof(buf), 14, "-549755813888", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, 0, + buf, sizeof(buf), 17, "-140737488355328", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, 0, + buf, sizeof(buf), 19, "-36028797018963968", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, 0, + buf, sizeof(buf), 21, "-9223372036854775808", 0); + + /* min values for unsigned 
1-8 byte integers */ + + CALL_AND_TEST("\x00", 1, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00", 2, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00", 3, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00", 4, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + /* max values for signed 1-8 byte integers */ + + CALL_AND_TEST("\xFF", 1, 0, + buf, sizeof(buf), 4, "127", 0); + + CALL_AND_TEST("\xFF\xFF", 2, 0, + buf, sizeof(buf), 6, "32767", 0); + + CALL_AND_TEST("\xFF\xFF\xFF", 3, 0, + buf, sizeof(buf), 8, "8388607", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, 0, + buf, sizeof(buf), 11, "2147483647", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, 0, + buf, sizeof(buf), 13, "549755813887", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, 0, + buf, sizeof(buf), 16, "140737488355327", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, 0, + buf, sizeof(buf), 18, "36028797018963967", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, 0, + buf, sizeof(buf), 20, "9223372036854775807", 0); + + /* max values for unsigned 1-8 byte integers */ + + CALL_AND_TEST("\xFF", 1, DATA_UNSIGNED, + buf, sizeof(buf), 4, "255", 0); + + CALL_AND_TEST("\xFF\xFF", 2, DATA_UNSIGNED, + buf, sizeof(buf), 6, "65535", 0); + + CALL_AND_TEST("\xFF\xFF\xFF", 3, DATA_UNSIGNED, + buf, sizeof(buf), 9, "16777215", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, DATA_UNSIGNED, + buf, sizeof(buf), 11, "4294967295", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, DATA_UNSIGNED, + buf, sizeof(buf), 14, "1099511627775", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, DATA_UNSIGNED, + buf, sizeof(buf), 16, "281474976710655", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, DATA_UNSIGNED, + buf, sizeof(buf), 18, "72057594037927935", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, DATA_UNSIGNED, + buf, sizeof(buf), 21, "18446744073709551615", 0); + + /* some random values */ + + CALL_AND_TEST("\x52", 1, 0, + buf, sizeof(buf), 4, "-46", 0); + + CALL_AND_TEST("\x0E", 1, DATA_UNSIGNED, + buf, sizeof(buf), 3, "14", 0); + + CALL_AND_TEST("\x62\xCE", 2, 0, + buf, sizeof(buf), 6, "-7474", 0); + + CALL_AND_TEST("\x29\xD6", 2, DATA_UNSIGNED, + buf, sizeof(buf), 6, "10710", 0); + + CALL_AND_TEST("\x7F\xFF\x90", 3, 0, + buf, sizeof(buf), 5, "-112", 0); + + CALL_AND_TEST("\x00\xA1\x16", 3, DATA_UNSIGNED, + buf, sizeof(buf), 6, "41238", 0); + + CALL_AND_TEST("\x7F\xFF\xFF\xF7", 4, 0, + buf, sizeof(buf), 3, "-9", 0); + + CALL_AND_TEST("\x00\x00\x00\x5C", 4, DATA_UNSIGNED, + buf, sizeof(buf), 3, "92", 0); + + CALL_AND_TEST("\x7F\xFF\xFF\xFF\xFF\xFF\xDC\x63", 8, 0, + buf, sizeof(buf), 6, "-9117", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x01\x64\x62", 8, DATA_UNSIGNED, + buf, sizeof(buf), 6, "91234", 0); +#endif + + /* speed test */ + + ut_chrono_t ch(__func__); + + for (i = 0; i < 1000000; i++) { + row_raw_format_int("\x23", 1, + 0, buf, sizeof(buf), + &format_in_hex); + row_raw_format_int("\x23", 1, + DATA_UNSIGNED, buf, sizeof(buf), + &format_in_hex); + + row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8, + 0, buf, 
sizeof(buf), + &format_in_hex); + row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8, + DATA_UNSIGNED, buf, sizeof(buf), + &format_in_hex); + } +} + +#endif /* HAVE_UT_CHRONO_T */ + +#endif /* UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT */ diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc new file mode 100644 index 00000000..6c76dd91 --- /dev/null +++ b/storage/innobase/row/row0sel.cc @@ -0,0 +1,6947 @@ +/***************************************************************************** + +Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/***************************************************//** +@file row/row0sel.cc +Select + +Created 12/19/1997 Heikki Tuuri +*******************************************************/ + +#include "row0sel.h" +#include "dict0dict.h" +#include "dict0boot.h" +#include "trx0undo.h" +#include "trx0trx.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "btr0sea.h" +#include "gis0rtree.h" +#include "mach0data.h" +#include "que0que.h" +#include "row0upd.h" +#include "row0row.h" +#include "row0vers.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "eval0eval.h" +#include "pars0sym.h" +#include "pars0pars.h" +#include "row0mysql.h" +#include "buf0lru.h" +#include "srv0srv.h" +#include "srv0mon.h" +#include "sql_error.h" +#ifdef WITH_WSREP +#include "mysql/service_wsrep.h" /* For wsrep_thd_skip_locking */ +#endif + +/* Maximum number of rows to prefetch; MySQL interface has another parameter */ +#define SEL_MAX_N_PREFETCH 16 + +/* Number of rows fetched, after which to start prefetching; MySQL interface +has another parameter */ +#define SEL_PREFETCH_LIMIT 1 + +/* When a select has accessed about this many pages, it returns control back +to que_run_threads: this is to allow canceling runaway queries */ + +#define SEL_COST_LIMIT 100 + +/* Flags for search shortcut */ +#define SEL_FOUND 0 +#define SEL_EXHAUSTED 1 +#define SEL_RETRY 2 + +/********************************************************************//** +Returns TRUE if the user-defined column in a secondary index record +is alphabetically the same as the corresponding BLOB column in the clustered +index record. +NOTE: the comparison is NOT done as a binary comparison, but character +fields are compared with collation! 
+@return whether the columns are equal */ +static +bool +row_sel_sec_rec_is_for_blob( +/*========================*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + ulint mbminlen, /*!< in: minimum length of + a character, in bytes */ + ulint mbmaxlen, /*!< in: maximum length of + a character, in bytes */ + const byte* clust_field, /*!< in: the locally stored part of + the clustered index column, including + the BLOB pointer; the clustered + index record must be covered by + a lock or a page latch to protect it + against deletion (rollback or purge) */ + ulint clust_len, /*!< in: length of clust_field */ + const byte* sec_field, /*!< in: column in secondary index */ + ulint sec_len, /*!< in: length of sec_field */ + ulint prefix_len, /*!< in: index column prefix length + in bytes, or 0 for full column */ + dict_table_t* table) /*!< in: table */ +{ + ulint len; + byte buf[REC_VERSION_56_MAX_INDEX_COL_LEN + 1]; + + /* This function should never be invoked on tables in + ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT, because they + should always contain enough prefix in the clustered index record. */ + ut_ad(dict_table_has_atomic_blobs(table)); + ut_a(clust_len >= BTR_EXTERN_FIELD_REF_SIZE); + ut_ad(!prefix_len || prefix_len >= sec_len); + ut_a(prefix_len <= sizeof buf); + + if (!memcmp(clust_field + clust_len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)) { + /* The externally stored field was not written yet. + This record should only be seen by + trx_rollback_recovered() or any + TRX_ISO_READ_UNCOMMITTED transactions. */ + return false; + } + + len = btr_copy_externally_stored_field_prefix( + buf, prefix_len ? prefix_len : sizeof buf, + table->space->zip_size(), + clust_field, clust_len); + + if (len == 0) { + /* The BLOB was being deleted as the server crashed. + There should not be any secondary index records + referring to this clustered index record, because + btr_free_externally_stored_field() is called after all + secondary index entries of the row have been purged. */ + return false; + } + + if (prefix_len) { + len = dtype_get_at_most_n_mbchars(prtype, mbminlen, mbmaxlen, + prefix_len, len, + reinterpret_cast<const char*> + (buf)); + } else if (len >= sizeof buf) { + ut_ad("too long column" == 0); + return false; + } + + return !cmp_data(mtype, prtype, false, buf, len, sec_field, sec_len); +} + +/** Function to read the secondary spatial index, calculate +the minimum bounding rectangle for clustered index record +and secondary index record and compare it. 
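+If the clustered geometry column is stored externally, the full BLOB is fetched so that the MBR can be computed from the WKB data and compared with the MBR stored in the secondary index record.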
+@param sec_rec secondary index record +@param sec_index spatial secondary index +@param clust_rec clustered index record +@param clust_index clustered index +@retval DB_SUCCESS_LOCKED_REC if the secondary record is equal to the + corresponding fields in the clustered record, when compared with + collation; +@retval DB_SUCCESS if not equal */ +static +dberr_t +row_sel_spatial_sec_rec_is_for_clust_rec( + const rec_t *sec_rec, const dict_index_t *sec_index, + const rec_t *clust_rec, dict_index_t *clust_index) +{ + mem_heap_t *heap= mem_heap_create(256); + rec_offs clust_offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs *clust_offs= clust_offsets_; + ulint clust_len; + + rec_offs_init(clust_offsets_); + ulint clust_pos= dict_col_get_clust_pos( + dict_index_get_nth_col(sec_index, 0), clust_index); + clust_offs= rec_get_offsets(clust_rec, clust_index, clust_offs, + clust_index->n_core_fields, clust_pos + 1, + &heap); + ut_ad(sec_index->n_user_defined_cols == 1); + const byte *clust_field= rec_get_nth_field(clust_rec, clust_offs, + clust_pos, &clust_len); + if (clust_len == UNIV_SQL_NULL || clust_len < GEO_DATA_HEADER_SIZE) + { + ut_ad("corrupted geometry column" == 0); +err_exit: + mem_heap_free(heap); + return DB_SUCCESS; + } + + /* For externally stored field, we need to get full + geo data to generate the MBR for comparing. */ + if (rec_offs_nth_extern(clust_offs, clust_pos)) + { + clust_field= btr_copy_externally_stored_field( + &clust_len, clust_field, sec_index->table->space->zip_size(), + clust_len, heap); + if (clust_field == NULL) + { + ut_ad("corrupted geometry blob" == 0); + goto err_exit; + } + } + + ut_ad(clust_len >= GEO_DATA_HEADER_SIZE); + rtr_mbr_t tmp_mbr; + rtr_mbr_t sec_mbr; + + rtree_mbr_from_wkb( + clust_field + GEO_DATA_HEADER_SIZE, + static_cast<uint>(clust_len - GEO_DATA_HEADER_SIZE), + SPDIMS, reinterpret_cast<double*>(&tmp_mbr)); + + rtr_read_mbr(sec_rec, &sec_mbr); + + mem_heap_free(heap); + return MBR_EQUAL_CMP(&sec_mbr, &tmp_mbr) + ? DB_SUCCESS_LOCKED_REC + : DB_SUCCESS; +} + +/** Returns TRUE if the user-defined column values in a secondary index record +are alphabetically the same as the corresponding columns in the clustered +index record. +NOTE: the comparison is NOT done as a binary comparison, but character +fields are compared with collation! +@param[in] sec_rec secondary index record +@param[in] sec_index secondary index +@param[in] clust_rec clustered index record; + must be protected by a page s-latch +@param[in] clust_index clustered index +@param[in] thr query thread +@retval DB_COMPUTE_VALUE_FAILED in case of virtual column value computation + failure. +@retval DB_SUCCESS_LOCKED_REC if the secondary record is equal to the + corresponding fields in the clustered record, when compared with + collation; +@retval DB_SUCCESS if not equal or if the clustered record has been marked + for deletion */ +static +dberr_t +row_sel_sec_rec_is_for_clust_rec( + const rec_t* sec_rec, + dict_index_t* sec_index, + const rec_t* clust_rec, + dict_index_t* clust_index, + que_thr_t* thr) +{ + if (rec_get_deleted_flag(clust_rec, + dict_table_is_comp(clust_index->table))) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(rec_get_trx_id(clust_rec, clust_index)); + + /* The clustered index record is delete-marked; + it is not visible in the read view. Besides, + if there are any externally stored columns, + some of them may have already been purged. 
*/ + return DB_SUCCESS; + } + + if (dict_index_is_spatial(sec_index)) { + return row_sel_spatial_sec_rec_is_for_clust_rec( + sec_rec, sec_index, clust_rec, + clust_index); + } + + const byte* sec_field; + ulint sec_len; + const byte* clust_field; + ulint n; + ulint i; + mem_heap_t* heap = mem_heap_create(256); + rec_offs clust_offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs sec_offsets_[REC_OFFS_SMALL_SIZE]; + rec_offs* clust_offs = clust_offsets_; + rec_offs* sec_offs = sec_offsets_; + + rec_offs_init(clust_offsets_); + rec_offs_init(sec_offsets_); + + ib_vcol_row vc(heap); + + clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs, + sec_index->n_fields, + ULINT_UNDEFINED, &heap); + + n = dict_index_get_n_ordering_defined_by_user(sec_index); + + for (i = 0; i < n; i++) { + const dict_field_t* ifield; + const dict_col_t* col; + ulint clust_pos = 0; + ulint clust_len = 0; + ulint len; + + ifield = dict_index_get_nth_field(sec_index, i); + col = dict_field_get_col(ifield); + + sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len); + + const bool is_virtual = col->is_virtual(); + + /* For virtual column, its value will need to be + reconstructed from base column in cluster index */ + if (is_virtual) { + const dict_v_col_t* v_col; + dfield_t* vfield; + row_ext_t* ext; + + byte *record = vc.record(thr_get_trx(thr)->mysql_thd, + clust_index, + &thr->prebuilt->m_mysql_table); + + v_col = reinterpret_cast<const dict_v_col_t*>(col); + + dtuple_t* row = row_build( + ROW_COPY_POINTERS, + clust_index, clust_rec, + clust_offs, + NULL, NULL, NULL, &ext, heap); + + vfield = innobase_get_computed_value( + row, v_col, clust_index, + &heap, NULL, NULL, + thr_get_trx(thr)->mysql_thd, + thr->prebuilt->m_mysql_table, + record, NULL, NULL, + true); + + if (vfield == NULL) { + innobase_report_computed_value_failed(row); + return DB_COMPUTE_VALUE_FAILED; + } + len = clust_len = vfield->len; + clust_field = static_cast<byte*>(vfield->data); + } else { + clust_pos = dict_col_get_clust_pos(col, clust_index); + + clust_field = rec_get_nth_cfield( + clust_rec, clust_index, clust_offs, + clust_pos, &clust_len); + if (clust_len == UNIV_SQL_NULL) { + if (sec_len == UNIV_SQL_NULL) { + continue; + } + return DB_SUCCESS; + } + if (sec_len == UNIV_SQL_NULL) { + return DB_SUCCESS; + } + + len = clust_len; + ulint prefix_len = ifield->prefix_len; + if (rec_offs_nth_extern(clust_offs, clust_pos)) { + /* BLOB can contain prefix. */ + len -= BTR_EXTERN_FIELD_REF_SIZE; + if (!len) { + goto compare_blobs; + } + } + + if (prefix_len) { + len = dtype_get_at_most_n_mbchars( + col->prtype, col->mbminlen, + col->mbmaxlen, prefix_len, len, + reinterpret_cast<const char*>( + clust_field)); + if (len < sec_len) { + goto check_for_blob; + } + } else { +check_for_blob: + if (rec_offs_nth_extern(clust_offs, + clust_pos)) { +compare_blobs: + if (!row_sel_sec_rec_is_for_blob( + col->mtype, col->prtype, + col->mbminlen, + col->mbmaxlen, + clust_field, clust_len, + sec_field, sec_len, + prefix_len, + clust_index->table)) { + return DB_SUCCESS; + } + + continue; + } + } + } + + if (cmp_data(col->mtype, col->prtype, false, + clust_field, len, sec_field, sec_len)) { + return DB_SUCCESS; + } + } + + return DB_SUCCESS_LOCKED_REC; +} + +/*********************************************************************//** +Creates a select node struct. 
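+The node is allocated from the supplied heap; it starts in state SEL_NODE_OPEN and has no table plans attached yet.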
+@return own: select node struct */ +sel_node_t* +sel_node_create( +/*============*/ + mem_heap_t* heap) /*!< in: memory heap where created */ +{ + sel_node_t* node; + + node = static_cast<sel_node_t*>( + mem_heap_alloc(heap, sizeof(sel_node_t))); + + node->common.type = QUE_NODE_SELECT; + node->state = SEL_NODE_OPEN; + + node->plans = NULL; + + return(node); +} + +/*********************************************************************//** +Frees the memory private to a select node when a query graph is freed, +does not free the heap where the node was originally created. */ +void +sel_node_free_private( +/*==================*/ + sel_node_t* node) /*!< in: select node struct */ +{ + ulint i; + plan_t* plan; + + if (node->plans != NULL) { + for (i = 0; i < node->n_tables; i++) { + plan = sel_node_get_nth_plan(node, i); + + btr_pcur_close(&(plan->pcur)); + btr_pcur_close(&(plan->clust_pcur)); + + if (plan->old_vers_heap) { + mem_heap_free(plan->old_vers_heap); + } + } + } +} + +/*********************************************************************//** +Evaluates the values in a select list. If there are aggregate functions, +their argument value is added to the aggregate total. */ +UNIV_INLINE +void +sel_eval_select_list( +/*=================*/ + sel_node_t* node) /*!< in: select node */ +{ + que_node_t* exp; + + exp = node->select_list; + + while (exp) { + eval_exp(exp); + + exp = que_node_get_next(exp); + } +} + +/*********************************************************************//** +Assigns the values in the select list to the possible into-variables in +SELECT ... INTO ... */ +UNIV_INLINE +void +sel_assign_into_var_values( +/*=======================*/ + sym_node_t* var, /*!< in: first variable in a list of + variables */ + sel_node_t* node) /*!< in: select node */ +{ + que_node_t* exp; + + if (var == NULL) { + + return; + } + + for (exp = node->select_list; + var != 0; + var = static_cast<sym_node_t*>(que_node_get_next(var))) { + + ut_ad(exp); + + eval_node_copy_val(var->alias, exp); + + exp = que_node_get_next(exp); + } +} + +/*********************************************************************//** +Resets the aggregate value totals in the select list of an aggregate type +query. */ +UNIV_INLINE +void +sel_reset_aggregate_vals( +/*=====================*/ + sel_node_t* node) /*!< in: select node */ +{ + func_node_t* func_node; + + ut_ad(node->is_aggregate); + + for (func_node = static_cast<func_node_t*>(node->select_list); + func_node != 0; + func_node = static_cast<func_node_t*>( + que_node_get_next(func_node))) { + + eval_node_set_int_val(func_node, 0); + } + + node->aggregate_already_fetched = FALSE; +} + +/*********************************************************************//** +Copies the input variable values when an explicit cursor is opened. */ +UNIV_INLINE +void +row_sel_copy_input_variable_vals( +/*=============================*/ + sel_node_t* node) /*!< in: select node */ +{ + sym_node_t* var; + + var = UT_LIST_GET_FIRST(node->copy_variables); + + while (var) { + eval_node_copy_val(var, var->alias); + + var->indirection = NULL; + + var = UT_LIST_GET_NEXT(col_var_list, var); + } +} + +/*********************************************************************//** +Fetches the column values from a record. 
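+Externally stored columns are copied to a temporary heap before being assigned; ordinary columns are either copied or referenced in place, depending on column->copy_val.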
*/ +static +void +row_sel_fetch_columns( +/*==================*/ + dict_index_t* index, /*!< in: record index */ + const rec_t* rec, /*!< in: record in a clustered or non-clustered + index; must be protected by a page latch */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + sym_node_t* column) /*!< in: first column in a column list, or + NULL */ +{ + dfield_t* val; + ulint index_type; + ulint field_no; + const byte* data; + ulint len; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (dict_index_is_clust(index)) { + index_type = SYM_CLUST_FIELD_NO; + } else { + index_type = SYM_SEC_FIELD_NO; + } + + while (column) { + mem_heap_t* heap = NULL; + ibool needs_copy; + + field_no = column->field_nos[index_type]; + + if (field_no != ULINT_UNDEFINED) { + + if (UNIV_UNLIKELY(rec_offs_nth_extern( + offsets, field_no) != 0)) { + + /* Copy an externally stored field to the + temporary heap, if possible. */ + + heap = mem_heap_create(1); + + data = btr_rec_copy_externally_stored_field( + rec, offsets, + index->table->space->zip_size(), + field_no, &len, heap); + + /* data == NULL means that the + externally stored field was not + written yet. This record + should only be seen by + trx_rollback_recovered() or any + TRX_ISO_READ_UNCOMMITTED + transactions. The InnoDB SQL parser + (the sole caller of this function) + does not implement READ UNCOMMITTED, + and it is not involved during rollback. */ + ut_a(data); + ut_a(len != UNIV_SQL_NULL); + + needs_copy = TRUE; + } else { + data = rec_get_nth_cfield(rec, index, offsets, + field_no, &len); + needs_copy = column->copy_val; + } + + if (needs_copy) { + eval_node_copy_and_alloc_val(column, data, + len); + } else { + val = que_node_get_val(column); + dfield_set_data(val, data, len); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + column = UT_LIST_GET_NEXT(col_var_list, column); + } +} + +/*********************************************************************//** +Allocates a prefetch buffer for a column when prefetch is first time done. */ +static +void +sel_col_prefetch_buf_alloc( +/*=======================*/ + sym_node_t* column) /*!< in: symbol table node for a column */ +{ + sel_buf_t* sel_buf; + ulint i; + + ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL); + + column->prefetch_buf = static_cast<sel_buf_t*>( + ut_malloc_nokey(SEL_MAX_N_PREFETCH * sizeof(sel_buf_t))); + + for (i = 0; i < SEL_MAX_N_PREFETCH; i++) { + sel_buf = column->prefetch_buf + i; + + sel_buf->data = NULL; + sel_buf->len = 0; + sel_buf->val_buf_size = 0; + } +} + +/*********************************************************************//** +Frees a prefetch buffer for a column, including the dynamically allocated +memory for data stored there. */ +void +sel_col_prefetch_buf_free( +/*======================*/ + sel_buf_t* prefetch_buf) /*!< in, own: prefetch buffer */ +{ + sel_buf_t* sel_buf; + ulint i; + + for (i = 0; i < SEL_MAX_N_PREFETCH; i++) { + sel_buf = prefetch_buf + i; + + if (sel_buf->val_buf_size > 0) { + + ut_free(sel_buf->data); + } + } + + ut_free(prefetch_buf); +} + +/*********************************************************************//** +Pops the column values for a prefetched, cached row from the column prefetch +buffers and places them to the val fields in the column nodes. 
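+The prefetch buffer slot and the val field swap their data pointers, so that the memory allocated for the previous value can be reused and freed later.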
*/ +static +void +sel_dequeue_prefetched_row( +/*=======================*/ + plan_t* plan) /*!< in: plan node for a table */ +{ + sym_node_t* column; + sel_buf_t* sel_buf; + dfield_t* val; + byte* data; + ulint len; + ulint val_buf_size; + + ut_ad(plan->n_rows_prefetched > 0); + + column = UT_LIST_GET_FIRST(plan->columns); + + while (column) { + val = que_node_get_val(column); + + if (!column->copy_val) { + /* We did not really push any value for the + column */ + + ut_ad(!column->prefetch_buf); + ut_ad(que_node_get_val_buf_size(column) == 0); + ut_d(dfield_set_null(val)); + + goto next_col; + } + + ut_ad(column->prefetch_buf); + ut_ad(!dfield_is_ext(val)); + + sel_buf = column->prefetch_buf + plan->first_prefetched; + + data = sel_buf->data; + len = sel_buf->len; + val_buf_size = sel_buf->val_buf_size; + + /* We must keep track of the allocated memory for + column values to be able to free it later: therefore + we swap the values for sel_buf and val */ + + sel_buf->data = static_cast<byte*>(dfield_get_data(val)); + sel_buf->len = dfield_get_len(val); + sel_buf->val_buf_size = que_node_get_val_buf_size(column); + + dfield_set_data(val, data, len); + que_node_set_val_buf_size(column, val_buf_size); +next_col: + column = UT_LIST_GET_NEXT(col_var_list, column); + } + + plan->n_rows_prefetched--; + + plan->first_prefetched++; +} + +/*********************************************************************//** +Pushes the column values for a prefetched, cached row to the column prefetch +buffers from the val fields in the column nodes. */ +UNIV_INLINE +void +sel_enqueue_prefetched_row( +/*=======================*/ + plan_t* plan) /*!< in: plan node for a table */ +{ + sym_node_t* column; + sel_buf_t* sel_buf; + dfield_t* val; + byte* data; + ulint len; + ulint pos; + ulint val_buf_size; + + if (plan->n_rows_prefetched == 0) { + pos = 0; + plan->first_prefetched = 0; + } else { + pos = plan->n_rows_prefetched; + + /* We have the convention that pushing new rows starts only + after the prefetch stack has been emptied: */ + + ut_ad(plan->first_prefetched == 0); + } + + plan->n_rows_prefetched++; + + ut_ad(pos < SEL_MAX_N_PREFETCH); + + for (column = UT_LIST_GET_FIRST(plan->columns); + column != 0; + column = UT_LIST_GET_NEXT(col_var_list, column)) { + + if (!column->copy_val) { + /* There is no sense to push pointers to database + page fields when we do not keep latch on the page! 
*/ + continue; + } + + if (!column->prefetch_buf) { + /* Allocate a new prefetch buffer */ + + sel_col_prefetch_buf_alloc(column); + } + + sel_buf = column->prefetch_buf + pos; + + val = que_node_get_val(column); + + data = static_cast<byte*>(dfield_get_data(val)); + len = dfield_get_len(val); + val_buf_size = que_node_get_val_buf_size(column); + + /* We must keep track of the allocated memory for + column values to be able to free it later: therefore + we swap the values for sel_buf and val */ + + dfield_set_data(val, sel_buf->data, sel_buf->len); + que_node_set_val_buf_size(column, sel_buf->val_buf_size); + + sel_buf->data = data; + sel_buf->len = len; + sel_buf->val_buf_size = val_buf_size; + } +} + +/*********************************************************************//** +Builds a previous version of a clustered index record for a consistent read +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_sel_build_prev_vers( +/*====================*/ + ReadView* read_view, /*!< in: read view */ + dict_index_t* index, /*!< in: plan node for table */ + rec_t* rec, /*!< in: record in a clustered index */ + rec_offs** offsets, /*!< in/out: offsets returned by + rec_get_offsets(rec, plan->index) */ + mem_heap_t** offset_heap, /*!< in/out: memory heap from which + the offsets are allocated */ + mem_heap_t** old_vers_heap, /*!< out: old version heap to use */ + rec_t** old_vers, /*!< out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + mtr_t* mtr) /*!< in: mtr */ +{ + dberr_t err; + + if (*old_vers_heap) { + mem_heap_empty(*old_vers_heap); + } else { + *old_vers_heap = mem_heap_create(512); + } + + err = row_vers_build_for_consistent_read( + rec, mtr, index, offsets, read_view, offset_heap, + *old_vers_heap, old_vers, NULL); + return(err); +} + +/*********************************************************************//** +Builds the last committed version of a clustered index record for a +semi-consistent read. */ +static +void +row_sel_build_committed_vers_for_mysql( +/*===================================*/ + dict_index_t* clust_index, /*!< in: clustered index */ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */ + const rec_t* rec, /*!< in: record in a clustered index */ + rec_offs** offsets, /*!< in/out: offsets returned by + rec_get_offsets(rec, clust_index) */ + mem_heap_t** offset_heap, /*!< in/out: memory heap from which + the offsets are allocated */ + const rec_t** old_vers, /*!< out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + dtuple_t** vrow, /*!< out: to be filled with old virtual + column version if any */ + mtr_t* mtr) /*!< in: mtr */ +{ + if (prebuilt->old_vers_heap) { + mem_heap_empty(prebuilt->old_vers_heap); + } else { + prebuilt->old_vers_heap = mem_heap_create( + rec_offs_size(*offsets)); + } + + row_vers_build_for_semi_consistent_read(prebuilt->trx, + rec, mtr, clust_index, offsets, offset_heap, + prebuilt->old_vers_heap, old_vers, vrow); +} + +/*********************************************************************//** +Tests the conditions which determine when the index segment we are searching +through has been exhausted. 
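+Each condition in plan->end_conds compares a column to an expression; the segment is considered exhausted as soon as one comparison evaluates to FALSE.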
+@return TRUE if row passed the tests */ +UNIV_INLINE +ibool +row_sel_test_end_conds( +/*===================*/ + plan_t* plan) /*!< in: plan for the table; the column values must + already have been retrieved and the right sides of + comparisons evaluated */ +{ + func_node_t* cond; + + /* All conditions in end_conds are comparisons of a column to an + expression */ + + for (cond = UT_LIST_GET_FIRST(plan->end_conds); + cond != 0; + cond = UT_LIST_GET_NEXT(cond_list, cond)) { + + /* Evaluate the left side of the comparison, i.e., get the + column value if there is an indirection */ + + eval_sym(static_cast<sym_node_t*>(cond->args)); + + /* Do the comparison */ + + if (!eval_cmp(cond)) { + + return(FALSE); + } + } + + return(TRUE); +} + +/*********************************************************************//** +Tests the other conditions. +@return TRUE if row passed the tests */ +UNIV_INLINE +ibool +row_sel_test_other_conds( +/*=====================*/ + plan_t* plan) /*!< in: plan for the table; the column values must + already have been retrieved */ +{ + func_node_t* cond; + + cond = UT_LIST_GET_FIRST(plan->other_conds); + + while (cond) { + eval_exp(cond); + + if (!eval_node_get_ibool_val(cond)) { + + return(FALSE); + } + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } + + return(TRUE); +} + +/** Check that a clustered index record is visible in a consistent read view. +@param rec clustered index record (in leaf page, or in memory) +@param index clustered index +@param offsets rec_get_offsets(rec, index) +@param view consistent read view +@retval DB_SUCCESS if rec is visible in view +@retval DB_SUCCESS_LOCKED_REC if rec is not visible in view +@retval DB_CORRUPTION if the DB_TRX_ID is corrupted */ +static dberr_t row_sel_clust_sees(const rec_t *rec, const dict_index_t &index, + const rec_offs *offsets, + const ReadView &view) +{ + ut_ad(index.is_primary()); + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(rec_offs_validate(rec, &index, offsets)); + ut_ad(!rec_is_metadata(rec, index)); + ut_ad(!index.table->is_temporary()); + + const trx_id_t id= row_get_rec_trx_id(rec, &index, offsets); + + if (view.changes_visible(id)) + return DB_SUCCESS; + if (UNIV_LIKELY(id < view.low_limit_id() || id < trx_sys.get_max_trx_id())) + return DB_SUCCESS_LOCKED_REC; + + ib::warn() << "A transaction id in a record of table " << index.table->name + << " is newer than the system-wide maximum."; + return DB_CORRUPTION; +} + +/*********************************************************************//** +Retrieves the clustered index record corresponding to a record in a +non-clustered index. Does the necessary locking. 
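+For a locking read, a lock is placed on the clustered index record; for a consistent read, an earlier version may be built, and the secondary index record is re-checked against it so that rows which did not exist in the read view are ignored.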
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_sel_get_clust_rec( +/*==================*/ + sel_node_t* node, /*!< in: select_node */ + plan_t* plan, /*!< in: plan node for table */ + rec_t* rec, /*!< in: record in a non-clustered index */ + que_thr_t* thr, /*!< in: query thread */ + rec_t** out_rec,/*!< out: clustered record or an old version of + it, NULL if the old version did not exist + in the read view, i.e., it was a fresh + inserted version */ + mtr_t* mtr) /*!< in: mtr used to get access to the + non-clustered record; the same mtr is used to + access the clustered index */ +{ + dict_index_t* index; + rec_t* clust_rec; + rec_t* old_vers; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + *out_rec = NULL; + + offsets = rec_get_offsets(rec, plan->pcur.index(), offsets, + plan->pcur.index()->n_core_fields, + ULINT_UNDEFINED, &heap); + + row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets); + + index = dict_table_get_first_index(plan->table); + plan->clust_pcur.old_rec = nullptr; + plan->clust_pcur.btr_cur.page_cur.index = index; + dberr_t err = btr_pcur_open_with_no_init(plan->clust_ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + &plan->clust_pcur, mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto err_exit; + } + + clust_rec = btr_pcur_get_rec(&(plan->clust_pcur)); + + /* Note: only if the search ends up on a non-infimum record is the + low_match value the real match to the search tuple */ + + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(&(plan->clust_pcur)) + < dict_index_get_n_unique(index)) { + + if (!node->read_view || + !rec_get_deleted_flag(rec, plan->table->not_redundant())) { + err = DB_CORRUPTION; + } + + /* In a rare case it is possible that no clust rec is found + for a delete-marked secondary index record: if in row0umod.cc + in row_undo_mod_remove_clust_low() we have already removed + the clust rec, while purge is still cleaning and removing + secondary index records associated with earlier versions of + the clustered index record. In that case we know that the + clustered index record did not exist in the read view of + trx. */ + + goto err_exit; + } + + offsets = rec_get_offsets(clust_rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (!node->read_view) { + /* Try to place a lock on the index record */ + trx_t* trx = thr_get_trx(thr); + + /* At READ UNCOMMITTED or READ COMMITTED isolation level + we lock only the record, i.e., next-key locking is + not used. */ + err = lock_clust_rec_read_check_and_lock( + 0, btr_pcur_get_block(&plan->clust_pcur), + clust_rec, index, offsets, + node->row_lock_mode, + trx->isolation_level <= TRX_ISO_READ_COMMITTED + ? LOCK_REC_NOT_GAP : LOCK_ORDINARY, + thr); + + switch (err) { + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + /* Declare the variable uninitialized. + It should be set to DB_SUCCESS at func_exit. 
*/ + MEM_UNDEFINED(&err, sizeof err); + break; + default: + goto err_exit; + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + old_vers = NULL; + + err = row_sel_clust_sees(clust_rec, *index, offsets, + *node->read_view); + + switch (err) { + default: + goto err_exit; + case DB_SUCCESS: + break; + case DB_SUCCESS_LOCKED_REC: + err = row_sel_build_prev_vers( + node->read_view, index, clust_rec, + &offsets, &heap, &plan->old_vers_heap, + &old_vers, mtr); + + if (err != DB_SUCCESS) { + + goto err_exit; + } + + clust_rec = old_vers; + + if (clust_rec == NULL) { + goto err_exit; + } + } + + /* If we had to go to an earlier version of row or the + secondary index record is delete marked, then it may be that + the secondary index record corresponding to clust_rec + (or old_vers) is not rec; in that case we must ignore + such row because in our snapshot rec would not have existed. + Remember that from rec we cannot see directly which transaction + id corresponds to it: we have to go to the clustered index + record. A query where we want to fetch all rows where + the secondary index value is in some interval would return + a wrong result if we would not drop rows which we come to + visit through secondary index records that would not really + exist in our snapshot. */ + + if (old_vers || rec_get_deleted_flag(rec, dict_table_is_comp( + plan->table))) { + err = row_sel_sec_rec_is_for_clust_rec(rec, + plan->index, clust_rec, + index, thr); + if (err != DB_SUCCESS_LOCKED_REC) { + goto err_exit; + } + } + } + + /* Fetch the columns needed in test conditions. The clustered + index record is protected by a page latch that was acquired + when plan->clust_pcur was positioned. The latch will not be + released until mtr->commit(). */ + + ut_ad(!rec_get_deleted_flag(clust_rec, rec_offs_comp(offsets))); + row_sel_fetch_columns(index, clust_rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); + *out_rec = clust_rec; + err = DB_SUCCESS; +err_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); +} + +/*********************************************************************//** +Sets a lock on a page of R-Tree record. 
This is an all-or-none action,
+mainly because we cannot reposition a record in an R-tree (owing to the
+nature of node splitting)
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+UNIV_INLINE
+dberr_t
+sel_set_rtr_rec_lock(
+/*=================*/
+ btr_pcur_t* pcur, /*!< in: cursor */
+ const rec_t* first_rec,/*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ unsigned mode, /*!< in: lock mode */
+ unsigned type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ matched_rec_t* match = pcur->btr_cur.rtr_info->matches;
+ mem_heap_t* heap = NULL;
+ dberr_t err = DB_SUCCESS;
+ trx_t* trx = thr_get_trx(thr);
+ buf_block_t* cur_block = btr_pcur_get_block(pcur);
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* my_offsets = const_cast<rec_offs*>(offsets);
+ rec_t* rec = const_cast<rec_t*>(first_rec);
+ rtr_rec_vector* match_rec;
+ rtr_rec_vector::iterator end;
+
+ rec_offs_init(offsets_);
+
+ if (match->locked || page_rec_is_supremum(first_rec)) {
+ return(DB_SUCCESS_LOCKED_REC);
+ }
+
+ ut_ad(page_align(first_rec) == cur_block->page.frame);
+ ut_ad(match->valid);
+
+ match->block.page.lock.x_lock();
+retry:
+ cur_block = btr_pcur_get_block(pcur);
+ ut_ad(match->block.page.lock.have_x()
+ || match->block.page.lock.have_s());
+ ut_ad(page_is_leaf(cur_block->page.frame));
+
+ err = lock_sec_rec_read_check_and_lock(
+ 0, cur_block, rec, index, my_offsets,
+ static_cast<lock_mode>(mode), type, thr);
+
+ if (err == DB_LOCK_WAIT) {
+re_scan:
+ mtr->commit();
+ trx->error_state = err;
+ thr->lock_state = QUE_THR_LOCK_ROW;
+ if (row_mysql_handle_errors(
+ &err, trx, thr, NULL)) {
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+ mtr->start();
+
+ mysql_mutex_lock(&match->rtr_match_mutex);
+ if (!match->valid && match->matched_recs->empty()) {
+ mysql_mutex_unlock(&match->rtr_match_mutex);
+ err = DB_RECORD_NOT_FOUND;
+ goto func_end;
+ }
+ mysql_mutex_unlock(&match->rtr_match_mutex);
+
+ /* MDEV-14059 FIXME: why re-latch the block?
+ pcur is already positioned on it! */
+ cur_block = buf_page_get_gen(
+ btr_pcur_get_block(pcur)->page.id(),
+ btr_pcur_get_block(pcur)->zip_size(),
+ RW_X_LATCH, NULL, BUF_GET, mtr, &err);
+ if (!cur_block) {
+ goto func_end;
+ }
+ } else {
+ mtr->start();
+ goto func_end;
+ }
+
+ DEBUG_SYNC_C("rtr_set_lock_wait");
+
+ if (!match->valid) {
+ /* Page got deleted */
+ mtr->commit();
+ mtr->start();
+ err = DB_RECORD_NOT_FOUND;
+ goto func_end;
+ }
+
+ match->matched_recs->clear();
+ // FIXME: check for !cur_block
+
+ rtr_cur_search_with_match(
+ cur_block, index,
+ pcur->btr_cur.rtr_info->search_tuple,
+ pcur->btr_cur.rtr_info->search_mode,
+ &pcur->btr_cur.page_cur,
+ pcur->btr_cur.rtr_info);
+
+ if (!page_is_leaf(buf_block_get_frame(cur_block))) {
+ /* Page got split and promoted (this is
+ possible only for the root page). 
Release the + page and ask for a re-search */ + mtr->commit(); + mtr->start(); + err = DB_RECORD_NOT_FOUND; + goto func_end; + } + + rec = btr_pcur_get_rec(pcur); + my_offsets = offsets_; + my_offsets = rec_get_offsets(rec, index, my_offsets, + index->n_fields, + ULINT_UNDEFINED, &heap); + + /* No match record */ + if (page_rec_is_supremum(rec) || !match->valid) { + mtr->commit(); + mtr->start(); + err = DB_RECORD_NOT_FOUND; + goto func_end; + } + + goto retry; + } + + my_offsets = offsets_; + match_rec = match->matched_recs; + end = match_rec->end(); + + for (rtr_rec_vector::iterator it = match_rec->begin(); + it != end; ++it) { + rtr_rec_t* rtr_rec = &(*it); + + my_offsets = rec_get_offsets( + rtr_rec->r_rec, index, my_offsets, index->n_fields, + ULINT_UNDEFINED, &heap); + + err = lock_sec_rec_read_check_and_lock( + 0, &match->block, rtr_rec->r_rec, index, + my_offsets, static_cast<lock_mode>(mode), + type, thr); + + if (err == DB_SUCCESS || err == DB_SUCCESS_LOCKED_REC) { + rtr_rec->locked = true; + } else if (err == DB_LOCK_WAIT) { + goto re_scan; + } else { + goto func_end; + } + } + + match->locked = true; + +func_end: + match->block.page.lock.x_unlock(); + if (heap != NULL) { + mem_heap_free(heap); + } + + ut_ad(err != DB_LOCK_WAIT); + + return(err); +} + +/*********************************************************************//** +Sets a lock on a record. +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ +UNIV_INLINE +dberr_t +sel_set_rec_lock( +/*=============*/ + btr_pcur_t* pcur, /*!< in: cursor */ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + unsigned mode, /*!< in: lock mode */ + unsigned type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOC_REC_NOT_GAP */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_t* trx; + dberr_t err = DB_SUCCESS; + const buf_block_t* block; + + block = btr_pcur_get_block(pcur); + + trx = thr_get_trx(thr); + + if (UT_LIST_GET_LEN(trx->lock.trx_locks) > 10000 + && buf_pool.running_out()) { + return DB_LOCK_TABLE_FULL; + } + + if (dict_index_is_clust(index)) { + err = lock_clust_rec_read_check_and_lock( + 0, block, rec, index, offsets, + static_cast<lock_mode>(mode), type, thr); + } else { + + if (dict_index_is_spatial(index)) { + if (type == LOCK_GAP || type == LOCK_ORDINARY) { + ut_ad(0); + ib::error() << "Incorrectly request GAP lock " + "on RTree"; + return(DB_SUCCESS); + } + err = sel_set_rtr_rec_lock(pcur, rec, index, offsets, + mode, type, thr, mtr); + } else { + err = lock_sec_rec_read_check_and_lock( + 0, block, rec, index, offsets, + static_cast<lock_mode>(mode), type, thr); + } + } + + return(err); +} + +/*********************************************************************//** +Opens a pcur to a table index. 
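+The search tuple, if any, is built from the already evaluated right-hand
+sides of plan->end_conds and the cursor is positioned with
+btr_pcur_open_with_no_init(); without a search tuple the cursor is simply
+opened at one end of the index with open_leaf().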
*/ +MY_ATTRIBUTE((warn_unused_result, nonnull)) +static +dberr_t +row_sel_open_pcur( +/*==============*/ + plan_t* plan, /*!< in: table plan */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + dict_index_t* index; + func_node_t* cond; + que_node_t* exp; + ulint n_fields; + ulint i; + + ut_ad(!plan->n_rows_prefetched); + ut_ad(!plan->n_rows_fetched); + ut_ad(!plan->cursor_at_end); + + index = plan->index; + + /* Calculate the value of the search tuple: the exact match columns + get their expressions evaluated when we evaluate the right sides of + end_conds */ + + cond = UT_LIST_GET_FIRST(plan->end_conds); + + while (cond) { + eval_exp(que_node_get_next(cond->args)); + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } + + plan->pcur.old_rec = nullptr; + plan->pcur.btr_cur.page_cur.index = index; + + dberr_t err; + + if (plan->tuple) { + n_fields = dtuple_get_n_fields(plan->tuple); + + if (plan->n_exact_match < n_fields) { + /* There is a non-exact match field which must be + evaluated separately */ + + eval_exp(plan->tuple_exps[n_fields - 1]); + } + + for (i = 0; i < n_fields; i++) { + exp = plan->tuple_exps[i]; + + dfield_copy_data(dtuple_get_nth_field(plan->tuple, i), + que_node_get_val(exp)); + } + + err = btr_pcur_open_with_no_init(plan->tuple, + plan->mode, BTR_SEARCH_LEAF, + &plan->pcur, mtr); + } else { + err = plan->pcur.open_leaf(plan->asc, index, BTR_SEARCH_LEAF, + mtr); + } + + plan->pcur_is_open = err == DB_SUCCESS; + return err; +} + +/*********************************************************************//** +Restores a stored pcur position to a table index. +@return TRUE if the cursor should be moved to the next record after we +return from this function (moved to the previous, in the case of a +descending cursor) without processing again the current cursor +record */ +static +ibool +row_sel_restore_pcur_pos( +/*=====================*/ + plan_t* plan, /*!< in: table plan */ + mtr_t* mtr) /*!< in: mtr */ +{ + ibool equal_position; + ulint relative_position; + + ut_ad(!plan->cursor_at_end); + + relative_position = btr_pcur_get_rel_pos(&(plan->pcur)); + + equal_position = + plan->pcur.restore_position(BTR_SEARCH_LEAF, mtr) == + btr_pcur_t::SAME_ALL; + + /* If the cursor is traveling upwards, and relative_position is + + (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock + yet on the successor of the page infimum; + (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the + first record GREATER than the predecessor of a page supremum; we have + not yet processed the cursor record: no need to move the cursor to the + next record; + (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the + last record LESS or EQUAL to the old stored user record; (a) if + equal_position is FALSE, this means that the cursor is now on a record + less than the old user record, and we must move to the next record; + (b) if equal_position is TRUE, then if + plan->stored_cursor_rec_processed is TRUE, we must move to the next + record, else there is no need to move the cursor. 
*/ + + if (plan->asc) { + if (relative_position == BTR_PCUR_ON) { + + if (equal_position) { + + return(plan->stored_cursor_rec_processed); + } + + return(TRUE); + } + + ut_ad(relative_position == BTR_PCUR_AFTER + || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE); + + return(FALSE); + } + + /* If the cursor is traveling downwards, and relative_position is + + (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on + the last record LESS than the successor of a page infimum; we have not + processed the cursor record: no need to move the cursor; + (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the + first record GREATER than the predecessor of a page supremum; we have + processed the cursor record: we should move the cursor to the previous + record; + (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the + last record LESS or EQUAL to the old stored user record; (a) if + equal_position is FALSE, this means that the cursor is now on a record + less than the old user record, and we need not move to the previous + record; (b) if equal_position is TRUE, then if + plan->stored_cursor_rec_processed is TRUE, we must move to the previous + record, else there is no need to move the cursor. */ + + if (relative_position == BTR_PCUR_BEFORE + || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) { + + return(FALSE); + } + + if (relative_position == BTR_PCUR_ON) { + + if (equal_position) { + + return(plan->stored_cursor_rec_processed); + } + + return(FALSE); + } + + ut_ad(relative_position == BTR_PCUR_AFTER + || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE); + + return(TRUE); +} + +/*********************************************************************//** +Resets a plan cursor to a closed state. */ +UNIV_INLINE +void +plan_reset_cursor( +/*==============*/ + plan_t* plan) /*!< in: plan */ +{ + plan->pcur_is_open = FALSE; + plan->cursor_at_end = FALSE; + plan->n_rows_fetched = 0; + plan->n_rows_prefetched = 0; +} + +#ifdef BTR_CUR_HASH_ADAPT +/*********************************************************************//** +Tries to do a shortcut to fetch a clustered index record with a unique key, +using the hash index if possible (not always). 
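+The shortcut is attempted only for a consistent read with a unique search
+condition and no separate clustered index fetch (see the assertions at the
+start of the function). Whenever the outcome cannot be decided safely,
+SEL_RETRY is returned and the caller falls back to the ordinary cursor path.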
+@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */ +static +ulint +row_sel_try_search_shortcut( +/*========================*/ + sel_node_t* node, /*!< in: select node for a consistent read */ + plan_t* plan, /*!< in: plan for a unique search in clustered + index */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index = plan->index; + + ut_ad(!index->table->is_temporary()); + ut_ad(node->read_view); + ut_ad(node->read_view->is_open()); + ut_ad(plan->unique_search); + ut_ad(!plan->must_get_clust); + + if (row_sel_open_pcur(plan, mtr) != DB_SUCCESS) { + return SEL_RETRY; + } + + const rec_t* rec = btr_pcur_get_rec(&(plan->pcur)); + + if (!page_rec_is_user_rec(rec) || rec_is_metadata(rec, *index)) { + return SEL_RETRY; + } + + ut_ad(plan->mode == PAGE_CUR_GE); + + /* As the cursor is now placed on a user record after a search with + the mode PAGE_CUR_GE, the up_match field in the cursor tells how many + fields in the user record matched to the search tuple */ + + if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) { + return SEL_EXHAUSTED; + } + + if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) { + /* See row_search_mvcc() for a comment on bulk_trx_id */ + if (!node->read_view->changes_visible(bulk_trx_id)) { + return SEL_EXHAUSTED; + } + } + + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (dict_index_is_clust(index)) { + if (row_sel_clust_sees(rec, *index, offsets, *node->read_view) + != DB_SUCCESS) { + return SEL_RETRY; + } + } else if (!srv_read_only_mode) { + trx_id_t trx_id = page_get_max_trx_id(page_align(rec)); + ut_ad(trx_id); + if (!node->read_view->sees(trx_id)) { + return SEL_RETRY; + } + } + + if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) { + return SEL_EXHAUSTED; + } + + /* Fetch the columns needed in test conditions. The index + record is protected by a page latch that was acquired when + plan->pcur was positioned. The latch will not be released + until mtr->commit(). */ + + row_sel_fetch_columns(index, rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); + + /* Test the rest of search conditions */ + + if (!row_sel_test_other_conds(plan)) { + return SEL_EXHAUSTED; + } + + ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF); + + plan->n_rows_fetched++; + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return SEL_FOUND; +} +#endif /* BTR_CUR_HASH_ADAPT */ + +/*********************************************************************//** +Performs a select step. 
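+A rough sketch of the control flow, for orientation only (the labels are
+the actual goto labels used in the function body):
+ table_loop: pick the plan of node->fetch_table, then open its cursor or
+             restore a stored cursor position;
+ rec_loop:   place locks for a locking read, check visibility against the
+             read view, test end_conds, fetch the clustered index record
+             when required, test other_conds, then prefetch the row or
+             goto next_table;
+ next_table: advance to the next table of the join, or assign the result
+             row with sel_assign_into_var_values().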
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +row_sel( +/*====*/ + sel_node_t* node, /*!< in: select node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dict_index_t* index; + plan_t* plan; + mtr_t mtr; + ibool moved; + rec_t* rec; + rec_t* old_vers; + rec_t* clust_rec; + + /* The following flag becomes TRUE when we are doing a + consistent read from a non-clustered index and we must look + at the clustered index to find out the previous delete mark + state of the non-clustered record: */ + + ibool cons_read_requires_clust_rec = FALSE; + ulint cost_counter = 0; + ibool cursor_just_opened; + ibool must_go_to_next; + ibool mtr_has_extra_clust_latch = FALSE; + /* TRUE if the search was made using + a non-clustered index, and we had to + access the clustered record: now &mtr + contains a clustered index latch, and + &mtr must be committed before we move + to the next non-clustered record */ + dberr_t err; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + const trx_t* trx = thr_get_trx(thr); + + ut_ad(thr->run_node == node); + ut_ad(!node->read_view || node->read_view == &trx->read_view); + ut_ad(!node->read_view || node->read_view->is_open()); + +table_loop: + /* TABLE LOOP + ---------- + This is the outer major loop in calculating a join. We come here when + node->fetch_table changes, and after adding a row to aggregate totals + and, of course, when this function is called. */ + + ut_ad(mtr_has_extra_clust_latch == FALSE); + + plan = sel_node_get_nth_plan(node, node->fetch_table); + index = plan->index; + + if (plan->n_rows_prefetched > 0) { + sel_dequeue_prefetched_row(plan); + + goto next_table_no_mtr; + } + + if (plan->cursor_at_end) { + /* The cursor has already reached the result set end: no more + rows to process for this table cursor, as also the prefetch + stack was empty */ + + ut_ad(plan->pcur_is_open); + + goto table_exhausted_no_mtr; + } + + /* Open a cursor to index, or restore an open cursor position */ + + mtr.start(); + +#ifdef BTR_CUR_HASH_ADAPT + if (node->read_view && plan->unique_search && !plan->pcur_is_open + && !plan->must_get_clust) { + switch (row_sel_try_search_shortcut(node, plan, &mtr)) { + case SEL_FOUND: + goto next_table; + case SEL_EXHAUSTED: + goto table_exhausted; + default: + ut_ad(0); + /* fall through */ + case SEL_RETRY: + break; + } + + plan_reset_cursor(plan); + + mtr.commit(); + mtr.start(); + } +#endif /* BTR_CUR_HASH_ADAPT */ + + if (!plan->pcur_is_open) { + /* Evaluate the expressions to build the search tuple and + open the cursor */ + err = row_sel_open_pcur(plan, &mtr); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto mtr_commit_exit; + } + + cursor_just_opened = TRUE; + + /* A new search was made: increment the cost counter */ + cost_counter++; + } else { + /* Restore pcur position to the index */ + + must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr); + + cursor_just_opened = FALSE; + + if (must_go_to_next) { + /* We have already processed the cursor record: move + to the next */ + + goto next_rec; + } + } + + if (!node->read_view + || trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) { + } else if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) { + /* See row_search_mvcc() for a comment on bulk_trx_id */ + if (!trx->read_view.changes_visible(bulk_trx_id)) { + goto table_exhausted; + } + } + +rec_loop: + /* RECORD LOOP + ----------- + In this loop we use pcur and try to fetch a qualifying row, and + also fill 
the prefetch buffer for this table if n_rows_fetched has + exceeded a threshold. While we are inside this loop, the following + holds: + (1) &mtr is started, + (2) pcur is positioned and open. + + NOTE that if cursor_just_opened is TRUE here, it means that we came + to this point right after row_sel_open_pcur. */ + + ut_ad(mtr_has_extra_clust_latch == FALSE); + + rec = btr_pcur_get_rec(&(plan->pcur)); + + /* PHASE 1: Set a lock if specified */ + + if (!node->asc && cursor_just_opened + && !page_rec_is_supremum(rec)) { + + /* Do not support "descending search" for Spatial index */ + ut_ad(!dict_index_is_spatial(index)); + + /* When we open a cursor for a descending search, we must set + a next-key lock on the successor record: otherwise it would + be possible to insert new records next to the cursor position, + and it might be that these new records should appear in the + search result set, resulting in the phantom problem. */ + + if (!node->read_view) { + const rec_t* next_rec = page_rec_get_next_const(rec); + if (UNIV_UNLIKELY(!next_rec)) { + err = DB_CORRUPTION; + goto lock_wait_or_error; + } + unsigned lock_type; + + offsets = rec_get_offsets(next_rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + + /* At READ UNCOMMITTED or READ COMMITTED + isolation level, we lock only the record, + i.e., next-key locking is not used. */ + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) { + if (page_rec_is_supremum(next_rec)) { + goto skip_lock; + } + + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = sel_set_rec_lock(&plan->pcur, + next_rec, index, offsets, + node->row_lock_mode, + lock_type, thr, &mtr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + /* fall through */ + case DB_SUCCESS: + break; + default: + /* Note that in this case we will store in pcur + the PREDECESSOR of the record we are waiting + the lock for */ + goto lock_wait_or_error; + } + } + } + +skip_lock: + if (page_rec_is_infimum(rec)) { + + /* The infimum record on a page cannot be in the result set, + and neither can a record lock be placed on it: we skip such + a record. We also increment the cost counter as we may have + processed yet another page of index. */ + + cost_counter++; + + goto next_rec; + } + + if (rec_is_metadata(rec, *index)) { + /* Skip the metadata pseudo-record. */ + cost_counter++; + goto next_rec; + } + + if (!node->read_view) { + /* Try to place a lock on the index record */ + unsigned lock_type; + + offsets = rec_get_offsets(rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + + /* At READ UNCOMMITTED or READ COMMITTED isolation level, + we lock only the record, i.e., next-key locking is + not used. 
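+ (At REPEATABLE READ and SERIALIZABLE, and except on spatial
+ indexes, LOCK_ORDINARY next-key locks are taken instead, so the
+ gap before the record is locked too and phantom rows cannot
+ appear in the result.)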
*/ + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED + || dict_index_is_spatial(index)) { + + if (page_rec_is_supremum(rec)) { + + goto next_rec; + } + + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = sel_set_rec_lock(&plan->pcur, + rec, index, offsets, + node->row_lock_mode, lock_type, + thr, &mtr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + /* fall through */ + case DB_SUCCESS: + break; + default: + goto lock_wait_or_error; + } + } + + if (page_rec_is_supremum(rec)) { + + /* A page supremum record cannot be in the result set: skip + it now when we have placed a possible lock on it */ + + goto next_rec; + } + + ut_ad(page_rec_is_user_rec(rec)); + + if (cost_counter > SEL_COST_LIMIT) { + + /* Now that we have placed the necessary locks, we can stop + for a while and store the cursor position; NOTE that if we + would store the cursor position BEFORE placing a record lock, + it might happen that the cursor would jump over some records + that another transaction could meanwhile insert adjacent to + the cursor: this would result in the phantom problem. */ + + goto stop_for_a_while; + } + + /* PHASE 2: Check a mixed index mix id if needed */ + + if (plan->unique_search && cursor_just_opened) { + + ut_ad(plan->mode == PAGE_CUR_GE); + + /* As the cursor is now placed on a user record after a search + with the mode PAGE_CUR_GE, the up_match field in the cursor + tells how many fields in the user record matched to the search + tuple */ + + if (btr_pcur_get_up_match(&(plan->pcur)) + < plan->n_exact_match) { + goto table_exhausted; + } + + /* Ok, no need to test end_conds or mix id */ + + } + + /* We are ready to look at a possible new index entry in the result + set: the cursor is now placed on a user record */ + + /* PHASE 3: Get previous version in a consistent read */ + + cons_read_requires_clust_rec = FALSE; + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (node->read_view) { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + if (dict_index_is_clust(index)) { + const trx_id_t id = row_get_rec_trx_id( + rec, index, offsets); + + if (!node->read_view->changes_visible(id)) { + if (id >= node->read_view->low_limit_id() + && id >= trx_sys.get_max_trx_id()) { + err = DB_CORRUPTION; + goto lock_wait_or_error; + } + + err = row_sel_build_prev_vers( + node->read_view, index, rec, + &offsets, &heap, &plan->old_vers_heap, + &old_vers, &mtr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (old_vers == NULL) { + /* The record does not exist + in our read view. Skip it, but + first attempt to determine + whether the index segment we + are searching through has been + exhausted. */ + + offsets = rec_get_offsets( + rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + + /* Fetch the columns needed in + test conditions. The clustered + index record is protected by a + page latch that was acquired + by row_sel_open_pcur() or + row_sel_restore_pcur_pos(). + The latch will not be released + until mtr.commit(). 
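+ Only the end conditions are
+ tested on this invisible
+ version: if they fail, the
+ index range is exhausted for
+ this plan; otherwise the
+ record is simply skipped.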
*/ + + row_sel_fetch_columns( + index, rec, offsets, + UT_LIST_GET_FIRST( + plan->columns)); + + if (!row_sel_test_end_conds(plan)) { + + goto table_exhausted; + } + + goto next_rec; + } + + rec = old_vers; + } + } else if (!srv_read_only_mode) { + trx_id_t trx_id = page_get_max_trx_id(page_align(rec)); + ut_ad(trx_id); + if (!node->read_view->sees(trx_id)) { + cons_read_requires_clust_rec = TRUE; + } + } + } + + /* PHASE 4: Test search end conditions and deleted flag */ + + /* Fetch the columns needed in test conditions. The record is + protected by a page latch that was acquired by + row_sel_open_pcur() or row_sel_restore_pcur_pos(). The latch + will not be released until mtr.commit(). */ + + row_sel_fetch_columns(index, rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); + + /* Test the selection end conditions: these can only contain columns + which already are found in the index, even though the index might be + non-clustered */ + + if (plan->unique_search && cursor_just_opened) { + + /* No test necessary: the test was already made above */ + + } else if (!row_sel_test_end_conds(plan)) { + + goto table_exhausted; + } + + if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table)) + && !cons_read_requires_clust_rec) { + + /* The record is delete marked: we can skip it if this is + not a consistent read which might see an earlier version + of a non-clustered index record */ + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + /* PHASE 5: Get the clustered index record, if needed and if we did + not do the search using the clustered index */ + + if (plan->must_get_clust || cons_read_requires_clust_rec) { + + /* It was a non-clustered index and we must fetch also the + clustered index record */ + + err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec, + &mtr); + mtr_has_extra_clust_latch = TRUE; + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + /* Retrieving the clustered record required a search: + increment the cost counter */ + + cost_counter++; + + if (clust_rec == NULL) { + /* The record did not exist in the read view */ + ut_ad(node->read_view); + + goto next_rec; + } + + if (rec_get_deleted_flag(clust_rec, + dict_table_is_comp(plan->table))) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing update_undo log record. 
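+ (A record is never delete-marked by a plain INSERT; the
+ mark is set only by UPDATE or DELETE, which always write
+ an undo log record.)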
*/ + ut_ad(rec_get_trx_id(clust_rec, + dict_table_get_first_index( + plan->table))); + + /* The record is delete marked: we can skip it */ + + goto next_rec; + } + + if (node->can_get_updated) { + + btr_pcur_store_position(&(plan->clust_pcur), &mtr); + } + } + + /* PHASE 6: Test the rest of search conditions */ + + if (!row_sel_test_other_conds(plan)) { + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + /* PHASE 7: We found a new qualifying row for the current table; push + the row if prefetch is on, or move to the next table in the join */ + + plan->n_rows_fetched++; + + ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF); + + if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT) + || plan->unique_search || plan->no_prefetch) { + + /* No prefetch in operation: go to the next table */ + + goto next_table; + } + + sel_enqueue_prefetched_row(plan); + + if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) { + + /* The prefetch buffer is now full */ + + sel_dequeue_prefetched_row(plan); + + goto next_table; + } + +next_rec: + if (mtr_has_extra_clust_latch) { + + /* We must commit &mtr if we are moving to the next + non-clustered index record, because we could break the + latching order if we would access a different clustered + index page right away without releasing the previous. */ + + goto commit_mtr_for_a_while; + } + + if (node->asc) { + moved = btr_pcur_move_to_next(&(plan->pcur), &mtr); + } else { + moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr); + } + + if (!moved) { + + goto table_exhausted; + } + + cursor_just_opened = FALSE; + + /* END OF RECORD LOOP + ------------------ */ + goto rec_loop; + +next_table: + /* We found a record which satisfies the conditions: we can move to + the next table or return a row in the result set */ + + ut_ad(btr_pcur_is_on_user_rec(&plan->pcur)); + + if (plan->unique_search && !node->can_get_updated) { + + plan->cursor_at_end = TRUE; + } else { + plan->stored_cursor_rec_processed = TRUE; + + btr_pcur_store_position(&(plan->pcur), &mtr); + } + + mtr.commit(); + + mtr_has_extra_clust_latch = FALSE; + +next_table_no_mtr: + /* If we use 'goto' to this label, it means that the row was popped + from the prefetched rows stack, and &mtr is already committed */ + + if (node->fetch_table + 1 == node->n_tables) { + + sel_eval_select_list(node); + + if (node->is_aggregate) { + + goto table_loop; + } + + sel_assign_into_var_values(node->into_list, node); + + thr->run_node = que_node_get_parent(node); + + err = DB_SUCCESS; + goto func_exit; + } + + node->fetch_table++; + + /* When we move to the next table, we first reset the plan cursor: + we do not care about resetting it when we backtrack from a table */ + + plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table)); + + goto table_loop; + +table_exhausted: + /* The table cursor pcur reached the result set end: backtrack to the + previous table in the join if we do not have cached prefetched rows */ + + plan->cursor_at_end = TRUE; + + mtr.commit(); + + mtr_has_extra_clust_latch = FALSE; + + if (plan->n_rows_prefetched > 0) { + /* The table became exhausted during a prefetch */ + + sel_dequeue_prefetched_row(plan); + + goto next_table_no_mtr; + } + +table_exhausted_no_mtr: + if (node->fetch_table == 0) { + err = DB_SUCCESS; + + if (node->is_aggregate && !node->aggregate_already_fetched) { + + node->aggregate_already_fetched = TRUE; + + sel_assign_into_var_values(node->into_list, node); + + thr->run_node = que_node_get_parent(node); + } else { + node->state = 
SEL_NODE_NO_MORE_ROWS; + + thr->run_node = que_node_get_parent(node); + } + + goto func_exit; + } + + node->fetch_table--; + + goto table_loop; + +stop_for_a_while: + /* Return control for a while to que_run_threads, so that runaway + queries can be canceled. NOTE that when we come here, we must, in a + locking read, have placed the necessary (possibly waiting request) + record lock on the cursor record or its successor: when we reposition + the cursor, this record lock guarantees that nobody can meanwhile have + inserted new records which should have appeared in the result set, + which would result in the phantom problem. */ + + plan->stored_cursor_rec_processed = FALSE; + btr_pcur_store_position(&(plan->pcur), &mtr); + + err = DB_SUCCESS; + goto mtr_commit_exit; + +commit_mtr_for_a_while: + /* Stores the cursor position and commits &mtr; this is used if + &mtr may contain latches which would break the latching order if + &mtr would not be committed and the latches released. */ + + plan->stored_cursor_rec_processed = TRUE; + + btr_pcur_store_position(&(plan->pcur), &mtr); + + mtr.commit(); + + mtr_has_extra_clust_latch = FALSE; + + goto table_loop; + +lock_wait_or_error: + /* See the note at stop_for_a_while: the same holds for this case */ + + ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc); + + plan->stored_cursor_rec_processed = FALSE; + btr_pcur_store_position(&(plan->pcur), &mtr); +mtr_commit_exit: + mtr.commit(); + +func_exit: + if (heap != NULL) { + mem_heap_free(heap); + } + return(err); +} + +/**********************************************************************//** +Performs a select step. This is a high-level function used in SQL execution +graphs. +@return query thread to run next or NULL */ +que_thr_t* +row_sel_step( +/*=========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + sel_node_t* node; + + ut_ad(thr); + + node = static_cast<sel_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_SELECT); + + /* If this is a new time this node is executed (or when execution + resumes after wait for a table intention lock), set intention locks + on the tables, or assign a read view */ + + if (node->into_list && (thr->prev_node == que_node_get_parent(node))) { + + node->state = SEL_NODE_OPEN; + } + + if (node->state == SEL_NODE_OPEN) { + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + trx_start_if_not_started_xa(thr_get_trx(thr), false); + + plan_reset_cursor(sel_node_get_nth_plan(node, 0)); + + if (node->consistent_read) { + trx_t *trx = thr_get_trx(thr); + /* Assign a read view for the query */ + trx->read_view.open(trx); + node->read_view = trx->read_view.is_open() ? 
+ &trx->read_view : NULL; + } else { + sym_node_t* table_node; + lock_mode i_lock_mode; + + if (node->set_x_locks) { + i_lock_mode = LOCK_IX; + } else { + i_lock_mode = LOCK_IS; + } + + for (table_node = node->table_list; + table_node != 0; + table_node = static_cast<sym_node_t*>( + que_node_get_next(table_node))) { + + dberr_t err = lock_table( + table_node->table, nullptr, + i_lock_mode, thr); + + if (err != DB_SUCCESS) { + trx_t* trx; + + trx = thr_get_trx(thr); + trx->error_state = err; + + return(NULL); + } + } + } + + /* If this is an explicit cursor, copy stored procedure + variable values, so that the values cannot change between + fetches (currently, we copy them also for non-explicit + cursors) */ + + if (node->explicit_cursor + && UT_LIST_GET_FIRST(node->copy_variables)) { + + row_sel_copy_input_variable_vals(node); + } + + node->state = SEL_NODE_FETCH; + node->fetch_table = 0; + + if (node->is_aggregate) { + /* Reset the aggregate total values */ + sel_reset_aggregate_vals(node); + } + } + + dberr_t err = row_sel(node, thr); + + /* NOTE! if queries are parallelized, the following assignment may + have problems; the assignment should be made only if thr is the + only top-level thr in the graph: */ + + thr->graph->last_sel_node = node; + + if (err != DB_SUCCESS) { + thr_get_trx(thr)->error_state = err; + + return(NULL); + } + + return(thr); +} + +/**********************************************************************//** +Performs a fetch for a cursor. +@return query thread to run next or NULL */ +que_thr_t* +fetch_step( +/*=======*/ + que_thr_t* thr) /*!< in: query thread */ +{ + sel_node_t* sel_node; + fetch_node_t* node; + + ut_ad(thr); + + node = static_cast<fetch_node_t*>(thr->run_node); + sel_node = node->cursor_def; + + ut_ad(que_node_get_type(node) == QUE_NODE_FETCH); + + if (thr->prev_node != que_node_get_parent(node)) { + + if (sel_node->state != SEL_NODE_NO_MORE_ROWS) { + + if (node->into_list) { + sel_assign_into_var_values(node->into_list, + sel_node); + } else { + ibool ret = (*node->func->func)( + sel_node, node->func->arg); + + if (!ret) { + sel_node->state + = SEL_NODE_NO_MORE_ROWS; + } + } + } + + thr->run_node = que_node_get_parent(node); + + return(thr); + } + + /* Make the fetch node the parent of the cursor definition for + the time of the fetch, so that execution knows to return to this + fetch node after a row has been selected or we know that there is + no row left */ + + sel_node->common.parent = node; + + if (sel_node->state == SEL_NODE_CLOSED) { + ib::error() << "fetch called on a closed cursor"; + + thr_get_trx(thr)->error_state = DB_ERROR; + + return(NULL); + } + + thr->run_node = sel_node; + + return(thr); +} + +/***********************************************************//** +Prints a row in a select result. 
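+Each value in sel_node->select_list is printed to stderr with
+dfield_print_also_hex(), separated by " ::: ", after which the cursor is
+positioned for the next fetch.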
+@return query thread to run next or NULL */ +que_thr_t* +row_printf_step( +/*============*/ + que_thr_t* thr) /*!< in: query thread */ +{ + row_printf_node_t* node; + sel_node_t* sel_node; + que_node_t* arg; + + ut_ad(thr); + + node = static_cast<row_printf_node_t*>(thr->run_node); + + sel_node = node->sel_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF); + + if (thr->prev_node == que_node_get_parent(node)) { + + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch next row to print */ + + thr->run_node = sel_node; + + return(thr); + } + + if (sel_node->state != SEL_NODE_FETCH) { + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to print */ + + thr->run_node = que_node_get_parent(node); + + return(thr); + } + + arg = sel_node->select_list; + + while (arg) { + dfield_print_also_hex(que_node_get_val(arg)); + + fputs(" ::: ", stderr); + + arg = que_node_get_next(arg); + } + + putc('\n', stderr); + + /* Fetch next row to print */ + + thr->run_node = sel_node; + + return(thr); +} + +/****************************************************************//** +Converts a key value stored in MySQL format to an Innobase dtuple. The last +field of the key value may be just a prefix of a fixed length field: hence +the parameter key_len. But currently we do not allow search keys where the +last field is only a prefix of the full key field len and print a warning if +such appears. A counterpart of this function is +ha_innobase::store_key_val_for_row() in ha_innodb.cc. */ +void +row_sel_convert_mysql_key_to_innobase( +/*==================================*/ + dtuple_t* tuple, /*!< in/out: tuple where to build; + NOTE: we assume that the type info + in the tuple is already according + to index! */ + byte* buf, /*!< in: buffer to use in field + conversions; NOTE that dtuple->data + may end up pointing inside buf so + do not discard that buffer while + the tuple is being used. See + row_mysql_store_col_in_innobase_format() + in the case of DATA_INT */ + ulint buf_len, /*!< in: buffer length */ + dict_index_t* index, /*!< in: index of the key value */ + const byte* key_ptr, /*!< in: MySQL key value */ + ulint key_len) /*!< in: MySQL key value length */ +{ + byte* original_buf = buf; + const byte* original_key_ptr = key_ptr; + dict_field_t* field; + dfield_t* dfield; + ulint data_offset; + ulint data_len; + ulint data_field_len; + ibool is_null; + const byte* key_end; + ulint n_fields = 0; + + /* For documentation of the key value storage format in MySQL, see + ha_innobase::store_key_val_for_row() in ha_innodb.cc. */ + + key_end = key_ptr + key_len; + + /* Permit us to access any field in the tuple (ULINT_MAX): */ + + dtuple_set_n_fields(tuple, ULINT_MAX); + + dfield = dtuple_get_nth_field(tuple, 0); + field = dict_index_get_nth_field(index, 0); + + if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) { + /* A special case: we are looking for a position in the + generated clustered index which InnoDB automatically added + to a table with no primary key: the first and the only + ordering column is ROW_ID which InnoDB stored to the key_ptr + buffer. 
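+ That generated index has a single ordering field, the 6-byte
+ ROW_ID, which is why key_len must equal DATA_ROW_ID_LEN below.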
*/ + + ut_a(key_len == DATA_ROW_ID_LEN); + + dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN); + + dtuple_set_n_fields(tuple, 1); + + return; + } + + while (key_ptr < key_end) { + + ulint type = dfield_get_type(dfield)->mtype; + ut_a(field->col->mtype == type); + + data_offset = 0; + is_null = FALSE; + + if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) { + /* The first byte in the field tells if this is + an SQL NULL value */ + + data_offset = 1; + + if (*key_ptr != 0) { + dfield_set_null(dfield); + + is_null = TRUE; + } + } + + /* Calculate data length and data field total length */ + if (DATA_LARGE_MTYPE(type) || DATA_GEOMETRY_MTYPE(type)) { + + /* For R-tree index, data length should be the + total size of the wkb data.*/ + if (dict_index_is_spatial(index)) { + ut_ad(DATA_GEOMETRY_MTYPE(type)); + data_len = key_len; + data_field_len = data_offset + data_len; + } else { + /* The key field is a column prefix of a BLOB + or TEXT. */ + + ut_a(field->prefix_len > 0); + + /* MySQL stores the actual data length to the + first 2 bytes after the optional SQL NULL + marker byte. The storage format is + little-endian, that is, the most significant + byte at a higher address. In UTF-8, MySQL + seems to reserve field->prefix_len bytes for + storing this field in the key value buffer, + even though the actual value only takes data + len bytes from the start. */ + + data_len = ulint(key_ptr[data_offset]) + | ulint(key_ptr[data_offset + 1]) << 8; + data_field_len = data_offset + 2 + + field->prefix_len; + + data_offset += 2; + + /* Now that we know the length, we store the + column value like it would be a fixed char + field */ + } + + + } else if (field->prefix_len > 0) { + /* Looks like MySQL pads unused end bytes in the + prefix with space. Therefore, also in UTF-8, it is ok + to compare with a prefix containing full prefix_len + bytes, and no need to take at most prefix_len / 3 + UTF-8 characters from the start. + If the prefix is used as the upper end of a LIKE + 'abc%' query, then MySQL pads the end with chars + 0xff. TODO: in that case does it any harm to compare + with the full prefix_len bytes. How do characters + 0xff in UTF-8 behave? */ + + data_len = field->prefix_len; + data_field_len = data_offset + data_len; + } else { + data_len = dfield_get_type(dfield)->len; + data_field_len = data_offset + data_len; + } + + if ((dtype_get_mysql_type(dfield_get_type(dfield)) + == DATA_MYSQL_TRUE_VARCHAR) + && (type != DATA_INT)) { + /* In a MySQL key value format, a true VARCHAR is + always preceded by 2 bytes of a length field. + dfield_get_type(dfield)->len returns the maximum + 'payload' len in bytes. That does not include the + 2 bytes that tell the actual data length. + + We added the check != DATA_INT to make sure we do + not treat MySQL ENUM or SET as a true VARCHAR! */ + + data_len += 2; + data_field_len += 2; + } + + /* Storing may use at most data_len bytes of buf */ + + if (UNIV_LIKELY(!is_null)) { + buf = row_mysql_store_col_in_innobase_format( + dfield, buf, + FALSE, /* MySQL key value format col */ + key_ptr + data_offset, data_len, + dict_table_is_comp(index->table)); + ut_a(buf <= original_buf + buf_len); + } + + key_ptr += data_field_len; + + if (UNIV_UNLIKELY(key_ptr > key_end)) { + /* The last field in key was not a complete key field + but a prefix of it. + + Print a warning about this! HA_READ_PREFIX_LAST does + not currently work in InnoDB with partial-field key + value prefixes. 
Since MySQL currently uses a padding + trick to calculate LIKE 'abc%' type queries there + should never be partial-field prefixes in searches. */ + + ib::warn() << "Using a partial-field key prefix in" + " search, index " << index->name + << " of table " << index->table->name + << ". Last data field length " + << data_field_len << " bytes, key ptr now" + " exceeds key end by " << (key_ptr - key_end) + << " bytes. Key value in the MariaDB format:"; + + ut_print_buf(stderr, original_key_ptr, key_len); + putc('\n', stderr); + + if (!is_null) { + ulint len = dfield_get_len(dfield); + dfield_set_len(dfield, len + - (ulint) (key_ptr - key_end)); + } + ut_ad(0); + } + + n_fields++; + field++; + dfield++; + } + + ut_a(buf <= original_buf + buf_len); + + /* We set the length of tuple to n_fields: we assume that the memory + area allocated for it is big enough (usually bigger than n_fields). */ + + dtuple_set_n_fields(tuple, n_fields); +} + +/**************************************************************//** +Stores a non-SQL-NULL field in the MySQL format. The counterpart of this +function is row_mysql_store_col_in_innobase_format() in row0mysql.cc. */ +void +row_sel_field_store_in_mysql_format_func( + byte* dest, + const mysql_row_templ_t* templ, +#ifdef UNIV_DEBUG + const dict_index_t* index, + ulint field_no, +#endif /* UNIV_DEBUG */ + const byte* data, + ulint len) +{ +#ifdef UNIV_DEBUG + const dict_field_t* field + = templ->is_virtual + ? NULL : dict_index_get_nth_field(index, field_no); +#endif /* UNIV_DEBUG */ + + ut_ad(len != UNIV_SQL_NULL); + MEM_CHECK_DEFINED(data, len); + MEM_CHECK_ADDRESSABLE(dest, templ->mysql_col_len); + MEM_UNDEFINED(dest, templ->mysql_col_len); + + byte* pad = dest + len; + + switch (templ->type) { + const byte* field_end; + case DATA_VARCHAR: + case DATA_VARMYSQL: + case DATA_BINARY: + field_end = dest + templ->mysql_col_len; + + if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR. Store the + length of the data to the first byte or the first + two bytes of dest. */ + + dest = row_mysql_store_true_var_len( + dest, len, templ->mysql_length_bytes); + /* Copy the actual data. Leave the rest of the + buffer uninitialized. */ + memcpy(dest, data, len); + break; + } + + /* Copy the actual data */ + memcpy(dest, data, len); + + /* Pad with trailing spaces. */ + + if (pad == field_end) { + break; + } + + if (UNIV_UNLIKELY(templ->type == DATA_FIXBINARY)) { + memset(pad, 0, field_end - pad); + break; + } + + ut_ad(templ->mbminlen <= templ->mbmaxlen); + + /* We treat some Unicode charset strings specially. */ + switch (templ->mbminlen) { + case 4: + /* InnoDB should never have stripped partial + UTF-32 characters. */ + ut_a(!(len & 3)); + break; + case 2: + /* A space char is two bytes, + 0x0020 in UCS2 and UTF-16 */ + + if (UNIV_UNLIKELY(len & 1)) { + /* A 0x20 has been stripped from the column. + Pad it back. */ + + if (pad < field_end) { + *pad++ = 0x20; + } + } + } + + row_mysql_pad_col(templ->mbminlen, pad, + ulint(field_end - pad)); + break; + + case DATA_BLOB: + /* Store a pointer to the BLOB buffer to dest: the BLOB was + already copied to the buffer in row_sel_store_mysql_rec */ + + row_mysql_store_blob_ref(dest, templ->mysql_col_len, data, + len); + break; + + case DATA_GEOMETRY: + /* We store all geometry data as BLOB data at server layer. 
*/ + row_mysql_store_geometry(dest, templ->mysql_col_len, data, len); + break; + + case DATA_MYSQL: + memcpy(dest, data, len); + + ut_ad(templ->mysql_col_len >= len); + ut_ad(templ->mbmaxlen >= templ->mbminlen); + + /* If field_no equals to templ->icp_rec_field_no, + we are examining a row pointed by "icp_rec_field_no". + There is possibility that icp_rec_field_no refers to + a field in a secondary index while templ->rec_field_no + points to field in a primary index. The length + should still be equal, unless the field pointed + by icp_rec_field_no has a prefix */ + ut_ad(templ->mbmaxlen > templ->mbminlen + || templ->mysql_col_len == len + || (field_no == templ->icp_rec_field_no + && field->prefix_len > 0)); + + /* The following assertion would fail for old tables + containing UTF-8 ENUM columns due to Bug #9526. */ + ut_ad(!templ->mbmaxlen + || !(templ->mysql_col_len % templ->mbmaxlen)); + ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len + || (field_no == templ->icp_rec_field_no + && field->prefix_len > 0) + || templ->rec_field_is_prefix); + + ut_ad(templ->is_virtual + || !(field->prefix_len % templ->mbmaxlen)); + + if (templ->mbminlen == 1 && templ->mbmaxlen != 1) { + /* Pad with spaces. This undoes the stripping + done in row0mysql.cc, function + row_mysql_store_col_in_innobase_format(). */ + + memset(pad, 0x20, templ->mysql_col_len - len); + } + break; + + default: +#ifdef UNIV_DEBUG + case DATA_SYS_CHILD: + case DATA_SYS: + /* These column types should never be shipped to MySQL. */ + ut_ad(0); + /* fall through */ + + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_DECIMAL: +#endif /* UNIV_DEBUG */ + ut_ad((templ->is_virtual && !field) + || (field && field->prefix_len + ? field->prefix_len == len + : templ->mysql_col_len == len)); + memcpy(dest, data, len); + break; + + case DATA_INT: + /* Convert InnoDB big-endian integer to little-endian + format, sign bit restored to 2's complement form */ + DBUG_ASSERT(templ->mysql_col_len == len); + + byte* ptr = pad; + do *--ptr = *data++; while (ptr != dest); + if (!templ->is_unsigned) { + pad[-1] ^= 0x80; + } + } +} + +/** Convert a field in the Innobase format to a field in the MySQL format. 
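+Externally stored (BLOB) columns are first copied from off-page storage into
+prebuilt->blob_heap or a temporary heap before being converted; SQL NULL
+values are materialized from prebuilt->default_rec.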
+@param[out] mysql_rec record in the MySQL format +@param[in,out] prebuilt prebuilt struct +@param[in] rec InnoDB record; must be protected + by a page latch +@param[in] index index of rec +@param[in] offsets array returned by rec_get_offsets() +@param[in] field_no templ->rec_field_no or + templ->clust_rec_field_no + or templ->icp_rec_field_no +@param[in] templ row template +*/ +static MY_ATTRIBUTE((warn_unused_result)) +ibool +row_sel_store_mysql_field( + byte* mysql_rec, + row_prebuilt_t* prebuilt, + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + ulint field_no, + const mysql_row_templ_t*templ) +{ + DBUG_ENTER("row_sel_store_mysql_field_func"); + + const byte* data; + ulint len; + + ut_ad(prebuilt->default_rec); + ut_ad(templ); + ut_ad(templ >= prebuilt->mysql_template); + ut_ad(templ < &prebuilt->mysql_template[prebuilt->n_template]); + ut_ad(field_no == templ->clust_rec_field_no + || field_no == templ->rec_field_no + || field_no == templ->icp_rec_field_no); + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no) != 0)) { + + mem_heap_t* heap; + /* Copy an externally stored field to a temporary heap */ + + ut_ad(field_no == templ->clust_rec_field_no); + + if (DATA_LARGE_MTYPE(templ->type)) { + if (prebuilt->blob_heap == NULL) { + prebuilt->blob_heap = mem_heap_create( + srv_page_size); + } + + heap = prebuilt->blob_heap; + } else { + heap = mem_heap_create(srv_page_size); + } + + /* NOTE: if we are retrieving a big BLOB, we may + already run out of memory in the next call, which + causes an assert */ + + data = btr_rec_copy_externally_stored_field( + rec, offsets, prebuilt->table->space->zip_size(), + field_no, &len, heap); + + if (UNIV_UNLIKELY(!data)) { + /* The externally stored field was not written + yet. This record should only be seen by + trx_rollback_recovered() or any + TRX_ISO_READ_UNCOMMITTED transactions. */ + + if (heap != prebuilt->blob_heap) { + mem_heap_free(heap); + } + + ut_a(prebuilt->trx->isolation_level + == TRX_ISO_READ_UNCOMMITTED); + DBUG_RETURN(FALSE); + } + + ut_a(len != UNIV_SQL_NULL); + + row_sel_field_store_in_mysql_format( + mysql_rec + templ->mysql_col_offset, + templ, index, field_no, data, len); + + if (heap != prebuilt->blob_heap) { + mem_heap_free(heap); + } + } else { + /* The field is stored in the index record, or + in the metadata for instant ADD COLUMN. */ + data = rec_get_nth_cfield(rec, index, offsets, field_no, &len); + + if (len == UNIV_SQL_NULL) { + /* MySQL assumes that the field for an SQL + NULL value is set to the default value. 
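+ Besides setting the NULL bit below, the column bytes are
+ therefore copied from prebuilt->default_rec, so the buffer never
+ contains stale data.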
*/ + ut_ad(templ->mysql_null_bit_mask); + + MEM_CHECK_DEFINED(prebuilt->default_rec + + templ->mysql_col_offset, + templ->mysql_col_len); +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */ +#endif + mysql_rec[templ->mysql_null_byte_offset] + |= (byte) templ->mysql_null_bit_mask; +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic pop +#endif + memcpy(mysql_rec + templ->mysql_col_offset, + (const byte*) prebuilt->default_rec + + templ->mysql_col_offset, + templ->mysql_col_len); + DBUG_RETURN(TRUE); + } + + if (DATA_LARGE_MTYPE(templ->type) + || DATA_GEOMETRY_MTYPE(templ->type)) { + + /* It is a BLOB field locally stored in the + InnoDB record: we MUST copy its contents to + prebuilt->blob_heap here because + row_sel_field_store_in_mysql_format() stores a + pointer to the data, and the data passed to us + will be invalid as soon as the + mini-transaction is committed and the page + latch on the clustered index page is + released. */ + + if (prebuilt->blob_heap == NULL) { + prebuilt->blob_heap = mem_heap_create( + srv_page_size); + DBUG_PRINT("anna", ("blob_heap allocated: %p", + prebuilt->blob_heap)); + } + + data = static_cast<byte*>( + mem_heap_dup(prebuilt->blob_heap, data, len)); + } + + row_sel_field_store_in_mysql_format( + mysql_rec + templ->mysql_col_offset, + templ, index, field_no, data, len); + } + + ut_ad(len != UNIV_SQL_NULL); + + if (templ->mysql_null_bit_mask) { + /* It is a nullable column with a non-NULL + value */ + mysql_rec[templ->mysql_null_byte_offset] + &= static_cast<byte>(~templ->mysql_null_bit_mask); + } + + DBUG_RETURN(TRUE); +} + +/** Convert a row in the Innobase format to a row in the MySQL format. +Note that the template in prebuilt may advise us to copy only a few +columns to mysql_rec, other columns are left blank. All columns may not +be needed in the query. +@param[out] mysql_rec row in the MySQL format +@param[in] prebuilt cursor +@param[in] rec Innobase record in the index + which was described in prebuilt's + template, or in the clustered index; + must be protected by a page latch +@param[in] vrow virtual columns +@param[in] rec_clust whether index must be the clustered index +@param[in] index index of rec +@param[in] offsets array returned by rec_get_offsets(rec) +@retval true on success +@retval false if not all columns could be retrieved */ +MY_ATTRIBUTE((warn_unused_result)) +static bool row_sel_store_mysql_rec( + byte* mysql_rec, + row_prebuilt_t* prebuilt, + const rec_t* rec, + const dtuple_t* vrow, + bool rec_clust, + const dict_index_t* index, + const rec_offs* offsets) +{ + DBUG_ENTER("row_sel_store_mysql_rec"); + + ut_ad(rec_clust || index == prebuilt->index); + ut_ad(!rec_clust || dict_index_is_clust(index)); + + if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) { + row_mysql_prebuilt_free_blob_heap(prebuilt); + } + + for (ulint i = 0; i < prebuilt->n_template; i++) { + const mysql_row_templ_t*templ = &prebuilt->mysql_template[i]; + + if (templ->is_virtual && dict_index_is_clust(index)) { + /* Skip virtual columns if it is not a covered + search or virtual key read is not requested. */ + if (!rec_clust + || !prebuilt->index->has_virtual() + || !prebuilt->read_just_key) { + /* Initialize the NULL bit. 
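+ A skipped virtual column should read as SQL NULL rather
+ than as leftover bytes from a previous row.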
*/ + if (templ->mysql_null_bit_mask) { +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */ +#endif + mysql_rec[templ->mysql_null_byte_offset] + |= (byte) templ->mysql_null_bit_mask; +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic pop +#endif + } + continue; + } + + dict_v_col_t* col; + col = dict_table_get_nth_v_col( + index->table, templ->clust_rec_field_no); + + ut_ad(vrow); + + const dfield_t* dfield = dtuple_get_nth_v_field( + vrow, col->v_pos); + + if (dfield_get_type(dfield)->mtype == DATA_MISSING) { + ut_ad("no ha_innopart in MariaDB" == 0); + continue; + } + + if (dfield->len == UNIV_SQL_NULL) { +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */ +#endif + mysql_rec[templ->mysql_null_byte_offset] + |= (byte) templ->mysql_null_bit_mask; +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic pop +#endif + memcpy(mysql_rec + + templ->mysql_col_offset, + (const byte*) prebuilt->default_rec + + templ->mysql_col_offset, + templ->mysql_col_len); + } else { + row_sel_field_store_in_mysql_format( + mysql_rec + templ->mysql_col_offset, + templ, index, templ->clust_rec_field_no, + (const byte*)dfield->data, dfield->len); + if (templ->mysql_null_bit_mask) { + mysql_rec[ + templ->mysql_null_byte_offset] + &= static_cast<byte> + (~templ->mysql_null_bit_mask); + } + } + + continue; + } + + const ulint field_no + = rec_clust + ? templ->clust_rec_field_no + : templ->rec_field_no; + /* We should never deliver column prefixes to the SQL layer, + except for evaluating handler_index_cond_check() + or handler_rowid_filter_check(). */ + /* ...actually, we do want to do this in order to + support the prefix query optimization. + + ut_ad(dict_index_get_nth_field(index, field_no)->prefix_len + == 0); + + ...so we disable this assert. */ + + if (!row_sel_store_mysql_field(mysql_rec, prebuilt, + rec, index, offsets, + field_no, templ)) { + + DBUG_RETURN(false); + } + } + + /* FIXME: We only need to read the doc_id if an FTS indexed + column is being updated. + NOTE, the record can be cluster or secondary index record. + if secondary index is used then FTS_DOC_ID column should be part + of this index. 
*/ + if (dict_table_has_fts_index(prebuilt->table)) { + if (dict_index_is_clust(index) + || prebuilt->fts_doc_id_in_read_set) { + prebuilt->fts_doc_id = fts_get_doc_id_from_rec( + rec, index, offsets); + } + } + + DBUG_RETURN(true); +} + +static void row_sel_reset_old_vers_heap(row_prebuilt_t *prebuilt) +{ + if (prebuilt->old_vers_heap) + mem_heap_empty(prebuilt->old_vers_heap); + else + prebuilt->old_vers_heap= mem_heap_create(200); +} + +/*********************************************************************//** +Builds a previous version of a clustered index record for a consistent read +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +row_sel_build_prev_vers_for_mysql( +/*==============================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct */ + dict_index_t* clust_index, /*!< in: clustered index */ + const rec_t* rec, /*!< in: record in a clustered index */ + rec_offs** offsets, /*!< in/out: offsets returned by + rec_get_offsets(rec, clust_index) */ + mem_heap_t** offset_heap, /*!< in/out: memory heap from which + the offsets are allocated */ + rec_t** old_vers, /*!< out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + dtuple_t** vrow, /*!< out: dtuple to hold old virtual + column data */ + mtr_t* mtr) /*!< in: mtr */ +{ + row_sel_reset_old_vers_heap(prebuilt); + + return row_vers_build_for_consistent_read( + rec, mtr, clust_index, offsets, + &prebuilt->trx->read_view, offset_heap, + prebuilt->old_vers_heap, old_vers, vrow); +} + +/** Helper class to cache clust_rec and old_vers */ +class Row_sel_get_clust_rec_for_mysql +{ + const rec_t *cached_clust_rec; + rec_t *cached_old_vers; + lsn_t cached_lsn; + page_id_t cached_page_id; + +#ifdef UNIV_DEBUG + void check_eq(const dict_index_t *index, const rec_offs *offsets) const + { + rec_offs vers_offs[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS]; + rec_offs_init(vers_offs); + mem_heap_t *heap= nullptr; + + ut_ad(rec_offs_validate(cached_clust_rec, index, offsets)); + ut_ad(index->first_user_field() <= rec_offs_n_fields(offsets)); + ut_ad(vers_offs == rec_get_offsets(cached_old_vers, index, vers_offs, + index->n_core_fields, + index->db_trx_id(), &heap)); + ut_ad(!heap); + for (auto n= index->db_trx_id(); n--; ) + { + const dict_col_t *col= dict_index_get_nth_col(index, n); + ulint len1, len2; + const byte *b1= rec_get_nth_field(cached_clust_rec, offsets, n, &len1); + const byte *b2= rec_get_nth_field(cached_old_vers, vers_offs, n, &len2); + ut_ad(!cmp_data(col->mtype, col->prtype, false, b1, len1, b2, len2)); + } + } +#endif + +public: + Row_sel_get_clust_rec_for_mysql() : + cached_clust_rec(NULL), cached_old_vers(NULL), cached_lsn(0), + cached_page_id(page_id_t(0,0)) {} + + dberr_t operator()(row_prebuilt_t *prebuilt, dict_index_t *sec_index, + const rec_t *rec, que_thr_t *thr, const rec_t **out_rec, + rec_offs **offsets, mem_heap_t **offset_heap, + dtuple_t **vrow, mtr_t *mtr); +}; + +/*********************************************************************//** +Retrieves the clustered index record corresponding to a record in a +non-clustered index. Does the necessary locking. Used in the MySQL +interface. 
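+The functor caches the most recently built old version: cached_old_vers is
+keyed on the page LSN, the page id and the clustered record pointer, so a
+repeated visit to the same clustered record within one consistent read does
+not have to rebuild the old version from the undo log again.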
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ +dberr_t +Row_sel_get_clust_rec_for_mysql::operator()( +/*============================*/ + row_prebuilt_t* prebuilt,/*!< in: prebuilt struct in the handle */ + dict_index_t* sec_index,/*!< in: secondary index where rec resides */ + const rec_t* rec, /*!< in: record in a non-clustered index; if + this is a locking read, then rec is not + allowed to be delete-marked, and that would + not make sense either */ + que_thr_t* thr, /*!< in: query thread */ + const rec_t** out_rec,/*!< out: clustered record or an old version of + it, NULL if the old version did not exist + in the read view, i.e., it was a fresh + inserted version */ + rec_offs** offsets,/*!< in: offsets returned by + rec_get_offsets(rec, sec_index); + out: offsets returned by + rec_get_offsets(out_rec, clust_index) */ + mem_heap_t** offset_heap,/*!< in/out: memory heap from which + the offsets are allocated */ + dtuple_t** vrow, /*!< out: virtual column to fill */ + mtr_t* mtr) /*!< in: mtr used to get access to the + non-clustered record; the same mtr is used to + access the clustered index */ +{ + dict_index_t* clust_index; + rec_t* old_vers; + trx_t* trx; + + prebuilt->clust_pcur->old_rec = nullptr; + *out_rec = NULL; + trx = thr_get_trx(thr); + + row_build_row_ref_in_tuple(prebuilt->clust_ref, rec, + sec_index, *offsets); + + clust_index = dict_table_get_first_index(sec_index->table); + prebuilt->clust_pcur->btr_cur.page_cur.index = clust_index; + + dberr_t err = btr_pcur_open_with_no_init(prebuilt->clust_ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + prebuilt->clust_pcur, mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + return err; + } + + const rec_t* clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur); + + prebuilt->clust_pcur->trx_if_known = trx; + + /* Note: only if the search ends up on a non-infimum record is the + low_match value the real match to the search tuple */ + + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(prebuilt->clust_pcur) + < dict_index_get_n_unique(clust_index)) { + btr_cur_t* btr_cur = btr_pcur_get_btr_cur(prebuilt->pcur); + + /* If this is a spatial index scan, and we are reading + from a shadow buffer, the record could be already + deleted (due to rollback etc.). So get the original + page and verify that */ + if (dict_index_is_spatial(sec_index) + && btr_cur->rtr_info->matches + && (page_align(rec) + == btr_cur->rtr_info->matches->block.page.frame + || rec != btr_pcur_get_rec(prebuilt->pcur))) { +#ifdef UNIV_DEBUG + rtr_info_t* rtr_info = btr_cur->rtr_info; + mysql_mutex_lock(&rtr_info->matches->rtr_match_mutex); + /* The page could be deallocated (by rollback etc.) */ + if (!rtr_info->matches->valid) { + mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex); + clust_rec = NULL; + goto func_exit; + } + mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex); + + if (rec_get_deleted_flag(rec, + dict_table_is_comp(sec_index->table)) + && prebuilt->select_lock_type == LOCK_NONE) { + + clust_rec = NULL; + goto func_exit; + } + + if (rec != btr_pcur_get_rec(prebuilt->pcur)) { + clust_rec = NULL; + goto func_exit; + } + + /* FIXME: Why is this block not the + same as btr_pcur_get_block(prebuilt->pcur), + and is it not unsafe to use RW_NO_LATCH here? 
*/ + buf_block_t* block = buf_page_get_gen( + btr_pcur_get_block(prebuilt->pcur)->page.id(), + btr_pcur_get_block(prebuilt->pcur)->zip_size(), + RW_NO_LATCH, NULL, BUF_GET, mtr, &err); + ut_ad(block); // FIXME: avoid crash + mem_heap_t* heap = mem_heap_create(256); + dtuple_t* tuple = dict_index_build_data_tuple( + rec, sec_index, true, + sec_index->n_fields, heap); + page_cur_t page_cursor; + page_cursor.block = block; + page_cursor.index = sec_index; + ulint up_match = 0, low_match = 0; + ut_ad(!page_cur_search_with_match(tuple, PAGE_CUR_LE, + &up_match, + &low_match, + &page_cursor, + nullptr)); + ut_ad(low_match < dtuple_get_n_fields_cmp(tuple)); + mem_heap_free(heap); + err = DB_SUCCESS; +#endif /* UNIV_DEBUG */ + } else if (!rec_get_deleted_flag(rec, + dict_table_is_comp(sec_index->table)) + || prebuilt->select_lock_type != LOCK_NONE) { + /* In a rare case it is possible that no clust + rec is found for a delete-marked secondary index + record: if row_undo_mod_clust() has already removed + the clust rec, while purge is still cleaning and + removing secondary index records associated with + earlier versions of the clustered index record. + In that case we know that the clustered index + record did not exist in the read view of trx. */ + ib::error() << "Clustered record for sec rec not found" + " index " << sec_index->name + << " of table " << sec_index->table->name; + + fputs("InnoDB: sec index record ", stderr); + rec_print(stderr, rec, sec_index); + fputs("\n" + "InnoDB: clust index record ", stderr); + rec_print(stderr, clust_rec, clust_index); + err = DB_CORRUPTION; + } + + clust_rec = NULL; + goto func_exit; + } + + *offsets = rec_get_offsets(clust_rec, clust_index, *offsets, + clust_index->n_core_fields, + ULINT_UNDEFINED, offset_heap); + + if (prebuilt->select_lock_type != LOCK_NONE) { + /* Try to place a lock on the index record; we are searching + the clust rec with a unique condition, hence + we set a LOCK_REC_NOT_GAP type lock */ + + err = lock_clust_rec_read_check_and_lock( + 0, btr_pcur_get_block(prebuilt->clust_pcur), + clust_rec, clust_index, *offsets, + prebuilt->select_lock_type, + LOCK_REC_NOT_GAP, + thr); + + switch (err) { + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + break; + default: + return err; + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + old_vers = NULL; + + if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED + || clust_index->table->is_temporary()) { + } else { + /* If the isolation level allows reading of + uncommitted data, then we never look for an + earlier version */ + err = row_sel_clust_sees(clust_rec, *clust_index, + *offsets, trx->read_view); + } + + switch (err) { + default: + return err; + case DB_SUCCESS: + break; + case DB_SUCCESS_LOCKED_REC: + const buf_page_t& bpage = btr_pcur_get_block( + prebuilt->clust_pcur)->page; + + const lsn_t lsn = mach_read_from_8( + page_align(clust_rec) + FIL_PAGE_LSN); + + if (lsn != cached_lsn + || bpage.id() != cached_page_id + || clust_rec != cached_clust_rec) { + /* The following call returns 'offsets' associated with + 'old_vers' */ + err = row_sel_build_prev_vers_for_mysql( + prebuilt, clust_index, + clust_rec, offsets, offset_heap, &old_vers, + vrow, mtr); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + return err; + } + cached_lsn = lsn; + cached_page_id = bpage.id(); + cached_clust_rec = clust_rec; + cached_old_vers = old_vers; + } else { + err = DB_SUCCESS; + old_vers = cached_old_vers; + + /* The offsets need not be same for 
the latest
+ version of clust_rec and its old version
+ old_vers. Re-calculate the offsets for old_vers. */
+
+ if (old_vers) {
+ ut_d(check_eq(clust_index, *offsets));
+ *offsets = rec_get_offsets(
+ old_vers, clust_index, *offsets,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, offset_heap);
+ }
+ }
+
+ if (old_vers == NULL) {
+ return err;
+ }
+
+ clust_rec = old_vers;
+ }
+
+ /* If we had to go to an earlier version of the row, or the
+ secondary index record is delete-marked, then it may be that
+ the secondary index record corresponding to clust_rec
+ (or old_vers) is not rec; in that case we must ignore
+ such a row because in our snapshot rec would not have existed.
+ Remember that from rec we cannot see directly which transaction
+ id corresponds to it: we have to go to the clustered index
+ record. A query that wants to fetch all rows where
+ the secondary index value is in some interval would return
+ a wrong result if we did not drop rows which we come to
+ visit through secondary index records that would not really
+ exist in our snapshot. */
+
+ /* And for a spatial index, since rec is from the shadow buffer,
+ we need to check whether it exactly matches clust_rec. */
+ if (clust_rec
+ && (old_vers
+ || trx->isolation_level <= TRX_ISO_READ_UNCOMMITTED
+ || dict_index_is_spatial(sec_index)
+ || rec_get_deleted_flag(rec, dict_table_is_comp(
+ sec_index->table)))) {
+ err = row_sel_sec_rec_is_for_clust_rec(rec, sec_index,
+ clust_rec, clust_index, thr);
+ switch (err) {
+ case DB_SUCCESS:
+ clust_rec = NULL;
+ break;
+ case DB_SUCCESS_LOCKED_REC:
+ break;
+ default:
+ return err;
+ }
+ }
+
+ err = DB_SUCCESS;
+ }
+
+func_exit:
+ *out_rec = clust_rec;
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* We may use the cursor in update or in unlock_row():
+ store its position */
+
+ btr_pcur_store_position(prebuilt->clust_pcur, mtr);
+ }
+
+ return err;
+}
+
+/** Restores cursor position after it has been stored. We have to take into
+account that the record the cursor was positioned on may have been deleted.
+Then we may have to move the cursor one step up or down.
+@param[out] same_user_rec true if we were able to restore the cursor on a user
+record with the same ordering prefix in the B-tree index
+@param[in] latch_mode latch mode wished in restoration
+@param[in] pcur cursor whose position has been stored
+@param[in] moves_up true if the cursor moves up in the index
+@param[in,out] mtr mtr; CAUTION: may commit mtr temporarily!
+@return true if we may need to process the record the cursor is now
+positioned on (i.e. we should not go to the next record yet) */
+static bool sel_restore_position_for_mysql(bool *same_user_rec,
+ btr_latch_mode latch_mode,
+ btr_pcur_t *pcur,
+ bool moves_up, mtr_t *mtr)
+{
+ auto status = pcur->restore_position(latch_mode, mtr);
+
+ *same_user_rec = status == btr_pcur_t::SAME_ALL;
+
+ ut_ad(!*same_user_rec || pcur->rel_pos == BTR_PCUR_ON);
+#ifdef UNIV_DEBUG
+ if (pcur->pos_state == BTR_PCUR_IS_POSITIONED_OPTIMISTIC) {
+ ut_ad(pcur->rel_pos == BTR_PCUR_BEFORE
+ || pcur->rel_pos == BTR_PCUR_AFTER);
+ } else {
+ ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad((pcur->rel_pos == BTR_PCUR_ON)
+ == btr_pcur_is_on_user_rec(pcur));
+ }
+#endif /* UNIV_DEBUG */
+
+ /* The position may need to be adjusted for rel_pos and moves_up.
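+ In short (a summary of the switch below): if the cursor was restored on
+ the same user record, it may or may not have to be processed again
+ depending on rel_pos; if the record was purged in the meantime, the
+ cursor may have to be moved one step forward (when scanning up) or
+ backward (when scanning down), skipping any metadata pseudo-record.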
*/ + + switch (pcur->rel_pos) { + case BTR_PCUR_ON: + if (!*same_user_rec && moves_up) { + if (status == btr_pcur_t::SAME_UNIQ) + return true; +next: + if (btr_pcur_move_to_next(pcur, mtr) + && rec_is_metadata(btr_pcur_get_rec(pcur), + *pcur->index())) { + btr_pcur_move_to_next(pcur, mtr); + } + + return true; + } + return(!*same_user_rec); + case BTR_PCUR_AFTER_LAST_IN_TREE: + case BTR_PCUR_BEFORE_FIRST_IN_TREE: + return true; + case BTR_PCUR_AFTER: + /* positioned to record after pcur->old_rec. */ + pcur->pos_state = BTR_PCUR_IS_POSITIONED; +prev: + if (btr_pcur_is_on_user_rec(pcur) && !moves_up + && !rec_is_metadata(btr_pcur_get_rec(pcur), + *pcur->index())) { + if (!btr_pcur_move_to_prev(pcur, mtr)) { + return true; + } + } + return true; + case BTR_PCUR_BEFORE: + /* For non optimistic restoration: + The position is now set to the record before pcur->old_rec. + + For optimistic restoration: + The position also needs to take the previous search_mode into + consideration. */ + + switch (pcur->pos_state) { + case BTR_PCUR_IS_POSITIONED_OPTIMISTIC: + pcur->pos_state = BTR_PCUR_IS_POSITIONED; + if (pcur->search_mode == PAGE_CUR_GE) { + /* Positioned during Greater or Equal search + with BTR_PCUR_BEFORE. Optimistic restore to + the same record. If scanning for lower then + we must move to previous record. + This can happen with: + HANDLER READ idx a = (const); + HANDLER READ idx PREV; */ + goto prev; + } + return true; + case BTR_PCUR_IS_POSITIONED: + if (moves_up && btr_pcur_is_on_user_rec(pcur)) { + goto next; + } + return true; + case BTR_PCUR_WAS_POSITIONED: + case BTR_PCUR_NOT_POSITIONED: + break; + } + } + ut_ad(0); + return true; +} + +/********************************************************************//** +Copies a cached field for MySQL from the fetch cache. */ +static +void +row_sel_copy_cached_field_for_mysql( +/*================================*/ + byte* buf, /*!< in/out: row buffer */ + const byte* cache, /*!< in: cached row */ + const mysql_row_templ_t*templ) /*!< in: column template */ +{ + ulint len; + + buf += templ->mysql_col_offset; + cache += templ->mysql_col_offset; + + MEM_CHECK_ADDRESSABLE(buf, templ->mysql_col_len); + + if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR + && (templ->type != DATA_INT)) { + /* Check for != DATA_INT to make sure we do + not treat MySQL ENUM or SET as a true VARCHAR! + Find the actual length of the true VARCHAR field. */ + row_mysql_read_true_varchar( + &len, cache, templ->mysql_length_bytes); + len += templ->mysql_length_bytes; + MEM_UNDEFINED(buf, templ->mysql_col_len); + } else { + len = templ->mysql_col_len; + } + + memcpy(buf, cache, len); +} + +/** Copy used fields from cached row. +Copy cache record field by field, don't touch fields that +are not covered by current key. +@param[out] buf Where to copy the MySQL row. +@param[in] cached_rec What to copy (in MySQL row format). +@param[in] prebuilt prebuilt struct. 
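+The NULL bit of each copied column is merged from cached_rec into buf below
+with the bit-select idiom buf ^= (buf ^ cache) & mask, which is equivalent to
+buf = (buf & ~mask) | (cache & mask) and leaves the other NULL bits stored in
+the same byte untouched.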
*/ +void +row_sel_copy_cached_fields_for_mysql( + byte* buf, + const byte* cached_rec, + row_prebuilt_t* prebuilt) +{ + const mysql_row_templ_t*templ; + ulint i; + for (i = 0; i < prebuilt->n_template; i++) { + templ = prebuilt->mysql_template + i; + + /* Skip virtual columns */ + if (templ->is_virtual) { + continue; + } + + row_sel_copy_cached_field_for_mysql( + buf, cached_rec, templ); + /* Copy NULL bit of the current field from cached_rec + to buf */ + if (templ->mysql_null_bit_mask) { +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */ +#endif + buf[templ->mysql_null_byte_offset] + ^= (buf[templ->mysql_null_byte_offset] + ^ cached_rec[templ->mysql_null_byte_offset]) + & (byte) templ->mysql_null_bit_mask; +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic pop +#endif + } + } +} + +/********************************************************************//** +Pops a cached row for MySQL from the fetch cache. */ +UNIV_INLINE +void +row_sel_dequeue_cached_row_for_mysql( +/*=================================*/ + byte* buf, /*!< in/out: buffer where to copy the + row */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct */ +{ + ulint i; + const mysql_row_templ_t*templ; + const byte* cached_rec; + ut_ad(prebuilt->n_fetch_cached > 0); + ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len); + + MEM_CHECK_ADDRESSABLE(buf, prebuilt->mysql_row_len); + + cached_rec = prebuilt->fetch_cache[prebuilt->fetch_cache_first]; + + if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) { + row_sel_copy_cached_fields_for_mysql(buf, cached_rec, prebuilt); + } else if (prebuilt->mysql_prefix_len > 63) { + /* The record is long. Copy it field by field, in case + there are some long VARCHAR column of which only a + small length is being used. */ + MEM_UNDEFINED(buf, prebuilt->mysql_prefix_len); + + /* First copy the NULL bits. */ + memcpy(buf, cached_rec, prebuilt->null_bitmap_len); + /* Then copy the requested fields. */ + + for (i = 0; i < prebuilt->n_template; i++) { + templ = prebuilt->mysql_template + i; + + /* Skip virtual columns */ + if (templ->is_virtual + && !(dict_index_has_virtual(prebuilt->index) + && prebuilt->read_just_key)) { + continue; + } + + row_sel_copy_cached_field_for_mysql( + buf, cached_rec, templ); + } + } else { + memcpy(buf, cached_rec, prebuilt->mysql_prefix_len); + } + + prebuilt->n_fetch_cached--; + prebuilt->fetch_cache_first++; + + if (prebuilt->n_fetch_cached == 0) { + prebuilt->fetch_cache_first = 0; + } +} + +/********************************************************************//** +Initialise the prefetch cache. */ +UNIV_INLINE +void +row_sel_prefetch_cache_init( +/*========================*/ + row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */ +{ + ulint i; + ulint sz; + byte* ptr; + + /* Reserve space for the magic number. */ + sz = UT_ARR_SIZE(prebuilt->fetch_cache) * (prebuilt->mysql_row_len + 8); + ptr = static_cast<byte*>(ut_malloc_nokey(sz)); + + for (i = 0; i < UT_ARR_SIZE(prebuilt->fetch_cache); i++) { + + /* A user has reported memory corruption in these + buffers in Linux. Put magic numbers there to help + to track a possible bug. 
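+ Each cache slot thus occupies mysql_row_len + 8 bytes of the single
+ allocation made above: a 4-byte ROW_PREBUILT_FETCH_MAGIC_N marker, the
+ row buffer itself, and a trailing 4-byte marker.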
*/ + + mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N); + ptr += 4; + + prebuilt->fetch_cache[i] = ptr; + ptr += prebuilt->mysql_row_len; + + mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N); + ptr += 4; + } +} + +/********************************************************************//** +Get the last fetch cache buffer from the queue. +@return pointer to buffer. */ +UNIV_INLINE +byte* +row_sel_fetch_last_buf( +/*===================*/ + row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */ +{ + ut_ad(!prebuilt->templ_contains_blob); + ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE); + + if (prebuilt->fetch_cache[0] == NULL) { + /* Allocate memory for the fetch cache */ + ut_ad(prebuilt->n_fetch_cached == 0); + + row_sel_prefetch_cache_init(prebuilt); + } + + ut_ad(prebuilt->fetch_cache_first == 0); + MEM_UNDEFINED(prebuilt->fetch_cache[prebuilt->n_fetch_cached], + prebuilt->mysql_row_len); + + return(prebuilt->fetch_cache[prebuilt->n_fetch_cached]); +} + +/********************************************************************//** +Pushes a row for MySQL to the fetch cache. */ +UNIV_INLINE +void +row_sel_enqueue_cache_row_for_mysql( +/*================================*/ + byte* mysql_rec, /*!< in/out: MySQL record */ + row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */ +{ + /* For non ICP code path the row should already exist in the + next fetch cache slot. */ + + if (prebuilt->pk_filter || prebuilt->idx_cond) { + memcpy(row_sel_fetch_last_buf(prebuilt), mysql_rec, + prebuilt->mysql_row_len); + } + + ++prebuilt->n_fetch_cached; +} + +#ifdef BTR_CUR_HASH_ADAPT +/*********************************************************************//** +Tries to do a shortcut to fetch a clustered index record with a unique key, +using the hash index if possible (not always). We assume that the search +mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx, +btr search latch has been locked in S-mode if AHI is enabled. 
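+A return value of SEL_RETRY means that the shortcut could not be used and the
+caller falls back to the normal search path; SEL_EXHAUSTED means that the
+result set is known to be empty for this search tuple.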
+@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */ +static +ulint +row_sel_try_search_shortcut_for_mysql( +/*==================================*/ + const rec_t** out_rec,/*!< out: record if found */ + row_prebuilt_t* prebuilt,/*!< in: prebuilt struct */ + rec_offs** offsets,/*!< in/out: for rec_get_offsets(*out_rec) */ + mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */ + mtr_t* mtr) /*!< in: started mtr */ +{ + dict_index_t* index = prebuilt->index; + const dtuple_t* search_tuple = prebuilt->search_tuple; + btr_pcur_t* pcur = prebuilt->pcur; + trx_t* trx = prebuilt->trx; + const rec_t* rec; + + ut_ad(index->is_primary()); + ut_ad(!index->table->is_temporary()); + ut_ad(!prebuilt->templ_contains_blob); + ut_ad(trx->read_view.is_open()); + pcur->old_rec = nullptr; + + if (btr_pcur_open_with_no_init(search_tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, pcur, mtr) + != DB_SUCCESS) { + return SEL_RETRY; + } + + rec = btr_pcur_get_rec(pcur); + + if (!page_rec_is_user_rec(rec) || rec_is_metadata(rec, *index)) { + return SEL_RETRY; + } + + /* As the cursor is now placed on a user record after a search with + the mode PAGE_CUR_GE, the up_match field in the cursor tells how many + fields in the user record matched to the search tuple */ + + if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) { + return SEL_EXHAUSTED; + } + + if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) { + } else if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) { + /* See row_search_mvcc() for a comment on bulk_trx_id */ + if (!trx->read_view.changes_visible(bulk_trx_id)) { + return SEL_EXHAUSTED; + } + } + + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + *offsets = rec_get_offsets(rec, index, *offsets, index->n_core_fields, + ULINT_UNDEFINED, heap); + + if (row_sel_clust_sees(rec, *index, *offsets, trx->read_view) + != DB_SUCCESS) { + return SEL_RETRY; + } + + if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(row_get_rec_trx_id(rec, index, *offsets)); + return SEL_EXHAUSTED; + } + + *out_rec = rec; + + return SEL_FOUND; +} +#endif /* BTR_CUR_HASH_ADAPT */ + +/*********************************************************************//** +Check a pushed-down index condition. +@return CHECK_ABORTED_BY_USER, CHECK_NEG, CHECK_POS, or CHECK_OUT_OF_RANGE */ +static +check_result_t +row_search_idx_cond_check( +/*======================*/ + byte* mysql_rec, /*!< out: record + in MySQL format (invalid unless + prebuilt->idx_cond!=NULL and + we return ICP_MATCH) */ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct + for the table handle */ + const rec_t* rec, /*!< in: InnoDB record */ + const rec_offs* offsets) /*!< in: rec_get_offsets() */ +{ + ulint i; + + ut_ad(rec_offs_validate(rec, prebuilt->index, offsets)); + + if (!prebuilt->idx_cond) { + if (!handler_rowid_filter_is_active(prebuilt->pk_filter)) { + return(CHECK_POS); + } + } else { + MONITOR_INC(MONITOR_ICP_ATTEMPTS); + } + + /* Convert to MySQL format those fields that are needed for + evaluating the index condition. 
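+ Only the first idx_cond_n_cols entries of mysql_template are converted
+ here; ha_innobase::build_template() is expected to have placed the
+ columns referenced by the pushed-down condition at the start of the
+ template array.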
*/ + + if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) { + mem_heap_empty(prebuilt->blob_heap); + } + + for (i = 0; i < prebuilt->idx_cond_n_cols; i++) { + const mysql_row_templ_t*templ = &prebuilt->mysql_template[i]; + + /* Skip virtual columns */ + if (templ->is_virtual) { + continue; + } + + if (!row_sel_store_mysql_field(mysql_rec, prebuilt, + rec, prebuilt->index, offsets, + templ->icp_rec_field_no, + templ)) { + return(CHECK_NEG); + } + } + + /* We assume that the index conditions on + case-insensitive columns are case-insensitive. The + case of such columns may be wrong in a secondary + index, if the case of the column has been updated in + the past, or a record has been deleted and a record + inserted in a different case. */ + check_result_t result = prebuilt->idx_cond + ? handler_index_cond_check(prebuilt->idx_cond) + : CHECK_POS; + + switch (result) { + case CHECK_POS: + if (handler_rowid_filter_is_active(prebuilt->pk_filter)) { + ut_ad(!prebuilt->index->is_primary()); + if (prebuilt->clust_index_was_generated) { + ulint len; + dict_index_t* index = prebuilt->index; + const byte* data = rec_get_nth_field( + rec, offsets, index->n_fields - 1, + &len); + ut_ad(dict_index_get_nth_col(index, + index->n_fields - 1) + ->prtype == (DATA_ROW_ID | DATA_NOT_NULL)); + ut_ad(len == DATA_ROW_ID_LEN); + memcpy(prebuilt->row_id, data, DATA_ROW_ID_LEN); + } + result = handler_rowid_filter_check(prebuilt->pk_filter); + switch (result) { + case CHECK_NEG: + MONITOR_INC(MONITOR_ICP_NO_MATCH); + return(result); + case CHECK_OUT_OF_RANGE: + MONITOR_INC(MONITOR_ICP_OUT_OF_RANGE); + return(result); + case CHECK_POS: + break; + default: + return(result); + } + } + /* Convert the remaining fields to MySQL format. + If this is a secondary index record, we must defer + this until we have fetched the clustered index record. 
*/ + if (!prebuilt->need_to_access_clustered + || dict_index_is_clust(prebuilt->index)) { + if (!row_sel_store_mysql_rec( + mysql_rec, prebuilt, rec, NULL, false, + prebuilt->index, offsets)) { + ut_ad(dict_index_is_clust(prebuilt->index)); + return(CHECK_NEG); + } + } + MONITOR_INC(MONITOR_ICP_MATCH); + return(result); + case CHECK_NEG: + MONITOR_INC(MONITOR_ICP_NO_MATCH); + return(result); + case CHECK_OUT_OF_RANGE: + MONITOR_INC(MONITOR_ICP_OUT_OF_RANGE); + return(result); + case CHECK_ERROR: + case CHECK_ABORTED_BY_USER: + return(result); + } + + ut_error; + return(result); +} + +/** Extract virtual column data from a virtual index record and fill a dtuple +@param[in] rec the virtual (secondary) index record +@param[in] index the virtual index +@param[in,out] vrow the dtuple where data extract to +@param[in] heap memory heap to allocate memory +*/ +static +void +row_sel_fill_vrow( + const rec_t* rec, + dict_index_t* index, + dtuple_t** vrow, + mem_heap_t* heap) +{ + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(!(*vrow)); + ut_ad(heap); + ut_ad(!dict_index_is_clust(index)); + ut_ad(!index->is_instant()); + ut_ad(page_rec_is_leaf(rec)); + + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + *vrow = dtuple_create_with_vcol( + heap, 0, dict_table_get_n_v_cols(index->table)); + + /* Initialize all virtual row's mtype to DATA_MISSING */ + dtuple_init_v_fld(*vrow); + + for (ulint i = 0; i < dict_index_get_n_fields(index); i++) { + const dict_field_t* field; + const dict_col_t* col; + + field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(field); + + if (col->is_virtual()) { + const byte* data; + ulint len; + + data = rec_get_nth_field(rec, offsets, i, &len); + + const dict_v_col_t* vcol = reinterpret_cast< + const dict_v_col_t*>(col); + + dfield_t* dfield = dtuple_get_nth_v_field( + *vrow, vcol->v_pos); + dfield_set_data(dfield, data, len); + dict_col_copy_type(col, dfield_get_type(dfield)); + } + } +} + +/** Return the record field length in characters. +@param[in] col table column of the field +@param[in] field_no field number +@param[in] rec physical record +@param[in] offsets field offsets in the physical record +@return field length in characters. */ +static +size_t +rec_field_len_in_chars( + const dict_col_t* col, + const ulint field_no, + const rec_t* rec, + const rec_offs* offsets) +{ + const ulint cset = dtype_get_charset_coll(col->prtype); + const CHARSET_INFO* cs = all_charsets[cset]; + ulint rec_field_len; + const char* rec_field = reinterpret_cast<const char *>( + rec_get_nth_field( + rec, offsets, field_no, &rec_field_len)); + + if (UNIV_UNLIKELY(!cs)) { + ib::warn() << "Missing collation " << cset; + return SIZE_T_MAX; + } + + return cs->numchars(rec_field, rec_field + rec_field_len); +} + +/** Avoid the clustered index lookup if all the following conditions +are true: +1) all columns are in secondary index +2) all values for columns that are prefix-only indexes are shorter +than the prefix size. This optimization can avoid many IOs for certain schemas. +@return true, to avoid clustered index lookup. 
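+For example (an illustrative case): with a secondary index on col(10) and a
+query that only reads col, a stored value of 7 characters fits entirely
+within the 10-character prefix, so the secondary index record already
+contains the full value and the clustered index need not be visited.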
*/
+static
+bool row_search_with_covering_prefix(
+ row_prebuilt_t* prebuilt,
+ const rec_t* rec,
+ const rec_offs* offsets)
+{
+ const dict_index_t* index = prebuilt->index;
+ ut_ad(!dict_index_is_clust(index));
+
+ /* In ha_innobase::build_template() we choose to access the
+ whole row when using exclusive row locks or in the case of an
+ FTS query, because then we need to read from the clustered index */
+ if (prebuilt->select_lock_type == LOCK_X || prebuilt->in_fts_query
+ || !index->is_btree()) {
+ return false;
+ }
+
+ /** The optimization is only applicable if the number of template
+ fields does not exceed the number of fields in the secondary index. */
+ if (prebuilt->n_template > index->n_fields) {
+ return false;
+ }
+
+ /* We can avoid a clustered index lookup if
+ all of the following hold:
+ (1) all columns are in the secondary index
+ (2) all values for columns that are prefix-only
+ indexes are shorter than the prefix size
+ This optimization can avoid many IOs for certain schemas. */
+ for (ulint i = 0; i < prebuilt->n_template; i++) {
+ mysql_row_templ_t* templ = prebuilt->mysql_template + i;
+ ulint j = templ->rec_prefix_field_no;
+ ut_ad(!templ->mbminlen == !templ->mbmaxlen);
+
+ /** Condition (1): is the field in the index? */
+ if (j == ULINT_UNDEFINED) {
+ return false;
+ }
+
+ /** Condition (2): if this is a prefix index, then the
+ row's value must be shorter than the prefix length. */
+
+ if (!templ->rec_field_is_prefix
+ || rec_offs_nth_sql_null(offsets, j)) {
+ continue;
+ }
+
+ const dict_field_t* field = dict_index_get_nth_field(index, j);
+
+ if (!field->prefix_len) {
+ continue;
+ }
+
+ const ulint rec_size = rec_offs_nth_size(offsets, j);
+
+ if (rec_size >= field->prefix_len) {
+ /* The byte length of the stored value is not
+ shorter than the maximum possible index prefix,
+ so the value may have been truncated. */
+ return false;
+ }
+
+ if (templ->mbminlen != templ->mbmaxlen
+ && rec_field_len_in_chars(field->col, j, rec, offsets)
+ >= field->prefix_len / templ->mbmaxlen) {
+ /* The number of characters in the stored value
+ may reach the index prefix length in characters,
+ so the value may have been truncated. */
+ return false;
+ }
+ }
+
+ /* The prefix index optimization conditions are satisfied: for
+ all columns above, use rec_prefix_field_no instead of
+ rec_field_no, and skip the clustered index lookup below. */
+ for (ulint i = 0; i < prebuilt->n_template; i++) {
+ mysql_row_templ_t* templ = prebuilt->mysql_template + i;
+ templ->rec_field_no = templ->rec_prefix_field_no;
+ ut_a(templ->rec_field_no != ULINT_UNDEFINED);
+ }
+
+ return true;
+}
+
+/** Searches for rows in the database using a cursor.
+The function is mainly used for tables that are shared across connections,
+and so it employs techniques that can help reconstruct the rows that the
+transaction is supposed to see.
+It also has optimizations such as prefetch caching of rows and use of the
+adaptive hash index.
+
+@param[out] buf buffer for the fetched row in MySQL format
+@param[in] mode search mode PAGE_CUR_L
+@param[in,out] prebuilt prebuilt struct for the table handler;
+ this contains the info for search_tuple and
+ index; if the search tuple contains 0 fields then
+ we position the cursor at the start or the end of
+ the index, depending on 'mode'
+@param[in] match_mode 0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX
+@param[in] direction 0 or ROW_SEL_NEXT or ROW_SEL_PREV;
+ Note: if this is != 0, then prebuilt must have a
+ pcur with a stored position! When opening a
+ cursor, 'direction' should be 0.
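+
+The implementation proceeds in the phases labelled in the function body:
+(1) try to pop a row from the prefetch cache, (2) optionally attempt a
+shortcut through the adaptive hash index for unique clustered-index point
+queries, (3) open or restore the index cursor position, and (4) loop over
+the matching records, applying locking or MVCC visibility rules and any
+pushed-down index condition.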
+@return DB_SUCCESS or error code */ +dberr_t +row_search_mvcc( + byte* buf, + page_cur_mode_t mode, + row_prebuilt_t* prebuilt, + ulint match_mode, + ulint direction) +{ + DBUG_ENTER("row_search_mvcc"); + DBUG_ASSERT(prebuilt->index->table == prebuilt->table); + + dict_index_t* index = prebuilt->index; + ibool comp = dict_table_is_comp(prebuilt->table); + const dtuple_t* search_tuple = prebuilt->search_tuple; + btr_pcur_t* pcur = prebuilt->pcur; + trx_t* trx = prebuilt->trx; + dict_index_t* clust_index; + que_thr_t* thr; + const rec_t* UNINIT_VAR(rec); + dtuple_t* vrow = NULL; + const rec_t* result_rec = NULL; + const rec_t* clust_rec; + Row_sel_get_clust_rec_for_mysql row_sel_get_clust_rec_for_mysql; + ibool unique_search = FALSE; + ulint mtr_extra_clust_savepoint = 0; + bool moves_up = false; + /* if the returned record was locked and we did a semi-consistent + read (fetch the newest committed version), then this is set to + TRUE */ + ulint next_offs; + bool same_user_rec; + ibool table_lock_waited = FALSE; + byte* next_buf = 0; + bool spatial_search = false; + + ut_ad(index && pcur && search_tuple); + ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED); + ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED); + + /* We don't support FTS queries from the HANDLER interfaces, because + we implemented FTS as reversed inverted index with auxiliary tables. + So anything related to traditional index query would not apply to + it. */ + if (prebuilt->index->type & DICT_FTS) { + DBUG_RETURN(DB_END_OF_INDEX); + } + + if (!prebuilt->table->space) { + DBUG_RETURN(DB_TABLESPACE_DELETED); + } else if (!prebuilt->table->is_readable()) { + if (fil_space_crypt_t* crypt_data = + prebuilt->table->space->crypt_data) { + if (crypt_data->should_encrypt()) { + DBUG_RETURN(DB_DECRYPTION_FAILED); + } + } + DBUG_RETURN(DB_CORRUPTION); + } else if (!prebuilt->index_usable) { + DBUG_RETURN(DB_MISSING_HISTORY); + } else if (prebuilt->index->is_corrupted()) { + DBUG_RETURN(DB_CORRUPTION); + } + + pcur->btr_cur.page_cur.index = index; + + /* We need to get the virtual column values stored in secondary + index key, if this is covered index scan or virtual key read is + requested. */ + bool need_vrow = prebuilt->read_just_key + && prebuilt->index->has_virtual(); + + /* Reset the new record lock info if READ UNCOMMITTED or + READ COMMITED isolation level is used. Then + we are able to remove the record locks set here on an individual + row. */ + prebuilt->new_rec_locks = 0; + + /*-------------------------------------------------------------*/ + /* PHASE 1: Try to pop the row from the prefetch cache */ + + if (UNIV_UNLIKELY(direction == 0)) { + trx->op_info = "starting index read"; + + prebuilt->n_rows_fetched = 0; + prebuilt->n_fetch_cached = 0; + prebuilt->fetch_cache_first = 0; + + if (prebuilt->sel_graph == NULL) { + /* Build a dummy select query graph */ + row_prebuild_sel_graph(prebuilt); + } + } else { + trx->op_info = "fetching rows"; + + if (prebuilt->n_rows_fetched == 0) { + prebuilt->fetch_direction = direction; + } + + if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) { + if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) { + ut_error; + /* TODO: scrollable cursor: restore cursor to + the place of the latest returned row, + or better: prevent caching for a scroll + cursor! 
*/ + } + + prebuilt->n_rows_fetched = 0; + prebuilt->n_fetch_cached = 0; + prebuilt->fetch_cache_first = 0; + + } else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) { + row_sel_dequeue_cached_row_for_mysql(buf, prebuilt); + + prebuilt->n_rows_fetched++; + trx->op_info = ""; + DBUG_RETURN(DB_SUCCESS); + } + + if (prebuilt->fetch_cache_first > 0 + && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) { +early_not_found: + /* The previous returned row was popped from the fetch + cache, but the cache was not full at the time of the + popping: no more rows can exist in the result set */ + trx->op_info = ""; + DBUG_RETURN(DB_RECORD_NOT_FOUND); + } + + prebuilt->n_rows_fetched++; + + if (prebuilt->n_rows_fetched > 1000000000) { + /* Prevent wrap-over */ + prebuilt->n_rows_fetched = 500000000; + } + + mode = pcur->search_mode; + } + + /* In a search where at most one record in the index may match, we + can use a LOCK_REC_NOT_GAP type record lock when locking a + non-delete-marked matching record. + + Note that in a unique secondary index there may be different + delete-marked versions of a record where only the primary key + values differ: thus in a secondary index we must use next-key + locks when locking delete-marked records. */ + + if (match_mode == ROW_SEL_EXACT + && dict_index_is_unique(index) + && dtuple_get_n_fields(search_tuple) + == dict_index_get_n_unique(index) + && (dict_index_is_clust(index) + || !dtuple_contains_null(search_tuple))) { + + /* Note above that a UNIQUE secondary index can contain many + rows with the same key value if one of the columns is the SQL + null. A clustered index under MySQL can never contain null + columns because we demand that all the columns in primary key + are non-null. */ + + unique_search = TRUE; + + /* Even if the condition is unique, MySQL seems to try to + retrieve also a second row if a primary key contains more than + 1 column. Return immediately if this is not a HANDLER + command. */ + + if (UNIV_UNLIKELY(direction != 0 + && !prebuilt->used_in_HANDLER)) { + goto early_not_found; + } + } + + /* We don't support sequencial scan for Rtree index, because it + is no meaning to do so. */ + if (dict_index_is_spatial(index) && !RTREE_SEARCH_MODE(mode)) { + trx->op_info = ""; + DBUG_RETURN(DB_END_OF_INDEX); + } + + /* if the query is a plain locking SELECT, and the isolation level + is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */ + bool did_semi_consistent_read = false; + mtr_t mtr; + mtr.start(); + + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + +#ifdef BTR_CUR_HASH_ADAPT + /*-------------------------------------------------------------*/ + /* PHASE 2: Try fast adaptive hash index search if possible */ + + /* Next test if this is the special case where we can use the fast + adaptive hash index to try the search. 
Since we must release the + search system latch when we retrieve an externally stored field, we + cannot use the adaptive hash index in a search in the case the row + may be long and there may be externally stored fields */ + + if (UNIV_UNLIKELY(direction == 0) + && unique_search + && btr_search_enabled + && dict_index_is_clust(index) + && !index->table->is_temporary() + && !prebuilt->templ_contains_blob + && !prebuilt->used_in_HANDLER + && (prebuilt->mysql_row_len < srv_page_size / 8)) { + + mode = PAGE_CUR_GE; + + if (prebuilt->select_lock_type == LOCK_NONE + && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED + && trx->read_view.is_open()) { + + /* This is a SELECT query done as a consistent read, + and the read view has already been allocated: + let us try a search shortcut through the hash + index. */ + + dberr_t err = DB_SUCCESS; + switch (row_sel_try_search_shortcut_for_mysql( + &rec, prebuilt, &offsets, &heap, + &mtr)) { + case SEL_FOUND: + /* At this point, rec is protected by + a page latch that was acquired by + row_sel_try_search_shortcut_for_mysql(). + The latch will not be released until + mtr.commit(). */ + ut_ad(!rec_get_deleted_flag(rec, comp)); + + if (prebuilt->pk_filter || prebuilt->idx_cond) { + switch (row_search_idx_cond_check( + buf, prebuilt, + rec, offsets)) { + case CHECK_ABORTED_BY_USER: + goto aborted; + case CHECK_NEG: + case CHECK_OUT_OF_RANGE: + case CHECK_ERROR: + err = DB_RECORD_NOT_FOUND; + goto shortcut_done; + case CHECK_POS: + goto shortcut_done; + } + + ut_ad("incorrect code" == 0); +aborted: + err = DB_INTERRUPTED; + goto shortcut_done; + } + + if (!row_sel_store_mysql_rec( + buf, prebuilt, + rec, NULL, false, index, + offsets)) { + /* Only fresh inserts may contain + incomplete externally stored + columns. Pretend that such + records do not exist. Such + records may only be accessed + at the READ UNCOMMITTED + isolation level or when + rolling back a recovered + transaction. Rollback happens + at a lower level, not here. */ + + /* Proceed as in case SEL_RETRY. */ + break; + } + + goto shortcut_done; + + case SEL_EXHAUSTED: + err = DB_RECORD_NOT_FOUND; + shortcut_done: + mtr.commit(); + + /* NOTE that we do NOT store the cursor + position */ + trx->op_info = ""; + ut_ad(!did_semi_consistent_read); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + DBUG_RETURN(err); + + case SEL_RETRY: + break; + + default: + ut_ad(0); + } + + mtr.commit(); + mtr.start(); + } + } +#endif /* BTR_CUR_HASH_ADAPT */ + + /*-------------------------------------------------------------*/ + /* PHASE 3: Open or restore index cursor position */ + + spatial_search = dict_index_is_spatial(index) + && mode >= PAGE_CUR_CONTAIN; + +#ifdef UNIV_DEBUG + /* The state of a running trx can only be changed by the + thread that is currently serving the transaction. Because we + are that thread, we can read trx->state without holding any + mutex. 
*/ + switch (trx->state) { + case TRX_STATE_ACTIVE: + break; + case TRX_STATE_NOT_STARTED: + ut_ad(prebuilt->sql_stat_start + || prebuilt->table->no_rollback()); + break; + default: + ut_ad("invalid trx->state" == 0); + } +#endif + + ut_ad(prebuilt->sql_stat_start + || prebuilt->select_lock_type != LOCK_NONE + || trx->read_view.is_open() + || prebuilt->table->no_rollback() + || srv_read_only_mode); + + /* Do not lock gaps at READ UNCOMMITTED or READ COMMITTED + isolation level */ + const bool set_also_gap_locks = + prebuilt->select_lock_type != LOCK_NONE + && trx->isolation_level > TRX_ISO_READ_COMMITTED +#ifdef WITH_WSREP + && !wsrep_thd_skip_locking(trx->mysql_thd) +#endif /* WITH_WSREP */ + ; + + /* Note that if the search mode was GE or G, then the cursor + naturally moves upward (in fetch next) in alphabetical order, + otherwise downward */ + + if (UNIV_UNLIKELY(direction == 0)) { + if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G + || mode >= PAGE_CUR_CONTAIN) { + moves_up = true; + } + } else if (direction == ROW_SEL_NEXT) { + moves_up = true; + } + + thr = que_fork_get_first_thr(prebuilt->sel_graph); + + clust_index = dict_table_get_first_index(prebuilt->table); + + dberr_t err = DB_SUCCESS; + + /* Do some start-of-statement preparations */ + + if (prebuilt->table->no_rollback()) { + /* NO_ROLLBACK tables do not support MVCC or locking. */ + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->sql_stat_start = FALSE; + } else if (!prebuilt->sql_stat_start) { + /* No need to set an intention lock or assign a read view */ + ut_a(prebuilt->select_lock_type != LOCK_NONE + || srv_read_only_mode || trx->read_view.is_open()); + } else { + prebuilt->sql_stat_start = FALSE; + trx_start_if_not_started(trx, false); + + if (prebuilt->select_lock_type == LOCK_NONE) { + trx->read_view.open(trx); + } else { +wait_table_again: + err = lock_table(prebuilt->table, nullptr, + prebuilt->select_lock_type == LOCK_S + ? LOCK_IS : LOCK_IX, thr); + + if (err != DB_SUCCESS) { + + table_lock_waited = TRUE; + goto lock_table_wait; + } + } + } + + /* Open or restore index cursor position */ + + if (UNIV_LIKELY(direction != 0)) { + if (spatial_search) { + /* R-Tree access does not need to do + cursor position and resposition */ + goto next_rec; + } + + bool need_to_process = sel_restore_position_for_mysql( + &same_user_rec, BTR_SEARCH_LEAF, + pcur, moves_up, &mtr); + + if (UNIV_UNLIKELY(need_to_process)) { + if (UNIV_UNLIKELY(!btr_pcur_get_rec(pcur))) { + mtr.commit(); + trx->op_info = ""; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return DB_CORRUPTION; + } + + if (UNIV_UNLIKELY(prebuilt->row_read_type + == ROW_READ_DID_SEMI_CONSISTENT)) { + /* We did a semi-consistent read, + but the record was removed in + the meantime. */ + prebuilt->row_read_type + = ROW_READ_TRY_SEMI_CONSISTENT; + } + } else if (UNIV_LIKELY(prebuilt->row_read_type + != ROW_READ_DID_SEMI_CONSISTENT)) { + + /* The cursor was positioned on the record + that we returned previously. If we need + to repeat a semi-consistent read as a + pessimistic locking read, the record + cannot be skipped. 
*/ + + goto next_rec_after_check; + } + + } else if (dtuple_get_n_fields(search_tuple) > 0) { + pcur->btr_cur.thr = thr; + pcur->old_rec = nullptr; + + if (index->is_spatial()) { + if (!prebuilt->rtr_info) { + prebuilt->rtr_info = rtr_create_rtr_info( + set_also_gap_locks, true, + btr_pcur_get_btr_cur(pcur), index); + prebuilt->rtr_info->search_tuple = search_tuple; + prebuilt->rtr_info->search_mode = mode; + rtr_info_update_btr(btr_pcur_get_btr_cur(pcur), + prebuilt->rtr_info); + } else { + rtr_info_reinit_in_cursor( + btr_pcur_get_btr_cur(pcur), + index, set_also_gap_locks); + prebuilt->rtr_info->search_tuple = search_tuple; + prebuilt->rtr_info->search_mode = mode; + } + + err = rtr_search_leaf(pcur, search_tuple, mode, &mtr); + } else { + err = btr_pcur_open_with_no_init(search_tuple, mode, + BTR_SEARCH_LEAF, + pcur, &mtr); + } + + if (err != DB_SUCCESS) { +page_corrupted: + rec = NULL; + goto page_read_error; + } + + pcur->trx_if_known = trx; + + rec = btr_pcur_get_rec(pcur); + ut_ad(page_rec_is_leaf(rec)); + + if (!moves_up + && set_also_gap_locks + && !page_rec_is_supremum(rec) + && !dict_index_is_spatial(index)) { + + /* Try to place a gap lock on the next index record + to prevent phantoms in ORDER BY ... DESC queries */ + const rec_t* next_rec = page_rec_get_next_const(rec); + if (UNIV_UNLIKELY(!next_rec)) { + err = DB_CORRUPTION; + goto page_corrupted; + } + + offsets = rec_get_offsets(next_rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + err = sel_set_rec_lock(pcur, + next_rec, index, offsets, + prebuilt->select_lock_type, + LOCK_GAP, thr, &mtr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + /* fall through */ + case DB_SUCCESS: + break; + default: + goto lock_wait_or_error; + } + } + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_L) { + err = pcur->open_leaf(mode == PAGE_CUR_G, index, + BTR_SEARCH_LEAF, &mtr); + + if (err != DB_SUCCESS) { + if (err == DB_DECRYPTION_FAILED) { + btr_decryption_failed(*index); + } + rec = NULL; + goto page_read_error; + } + } + + /* Check if the table is supposed to be empty for our read view. + + If we read bulk_trx_id as an older transaction ID, it is not + incorrect to check here whether that transaction should be + visible to us. If bulk_trx_id is not visible to us, the table + must have been empty at an earlier point of time, also in our + read view. + + An INSERT would only update bulk_trx_id in + row_ins_clust_index_entry_low() if the table really was empty + (everything had been purged), when holding a leaf page latch + in the clustered index (actually, the root page is the only + leaf page in that case). + + We are already holding a leaf page latch here, either + in a secondary index or in a clustered index. + + If we are holding a clustered index page latch, there clearly + is no potential for race condition with a concurrent INSERT: + such INSERT would be blocked by us. + + If we are holding a secondary index page latch, then we are + not directly blocking a concurrent INSERT that might update + bulk_trx_id to something that does not exist in our read view. + But, in that case, the entire table (all indexes) must have + been empty. So, even if our read below missed the update of + index->table->bulk_trx_id, we can safely proceed to reading + the empty secondary index page. Our latch will prevent the + INSERT from proceeding to that page. It will first modify + the clustered index. 
Also, we may only look up something in + the clustered index if the secondary index page is not empty + to begin with. So, only if the table is corrupted + (the clustered index is empty but the secondary index is not) + we could return corrupted results. */ + if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED + || !trx->read_view.is_open()) { + } else if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) { + /* InnoDB should allow the transaction to read all + the rows when InnoDB intends to do any locking + on the record */ + if (prebuilt->select_lock_type == LOCK_NONE + && !trx->read_view.changes_visible(bulk_trx_id)) { + trx->op_info = ""; + err = DB_END_OF_INDEX; + goto normal_return; + } + } + +rec_loop: + DEBUG_SYNC_C("row_search_rec_loop"); + if (trx_is_interrupted(trx)) { + if (!spatial_search) { + btr_pcur_store_position(pcur, &mtr); + } + err = DB_INTERRUPTED; + goto normal_return; + } + + /*-------------------------------------------------------------*/ + /* PHASE 4: Look for matching records in a loop */ + + rec = btr_pcur_get_rec(pcur); + + ut_ad(!!page_rec_is_comp(rec) == comp); + ut_ad(page_rec_is_leaf(rec)); + + if (page_rec_is_infimum(rec)) { + + /* The infimum record on a page cannot be in the result set, + and neither can a record lock be placed on it: we skip such + a record. */ + + goto next_rec; + } + + if (page_rec_is_supremum(rec)) { + + if (set_also_gap_locks + && !dict_index_is_spatial(index)) { + + /* Try to place a lock on the index record */ + + /* If the transaction isolation level is + READ UNCOMMITTED or READ COMMITTED, + we do not lock gaps. Supremum record is really + a gap and therefore we do not set locks there. */ + + offsets = rec_get_offsets(rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + err = sel_set_rec_lock(pcur, + rec, index, offsets, + prebuilt->select_lock_type, + LOCK_ORDINARY, thr, &mtr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + /* fall through */ + case DB_SUCCESS: + break; + default: + goto lock_wait_or_error; + } + } + + /* A page supremum record cannot be in the result set: skip + it now that we have placed a possible lock on it */ + + goto next_rec; + } + + /*-------------------------------------------------------------*/ + /* Do sanity checks in case our cursor has bumped into page + corruption */ + + if (comp) { + if (rec_get_info_bits(rec, true) & REC_INFO_MIN_REC_FLAG) { + /* Skip the metadata pseudo-record. */ + ut_ad(index->is_instant()); + goto next_rec; + } + + next_offs = rec_get_next_offs(rec, TRUE); + if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) { + + goto wrong_offs; + } + } else { + if (rec_get_info_bits(rec, false) & REC_INFO_MIN_REC_FLAG) { + /* Skip the metadata pseudo-record. */ + ut_ad(index->is_instant()); + goto next_rec; + } + + next_offs = rec_get_next_offs(rec, FALSE); + if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) { + + goto wrong_offs; + } + } + + if (UNIV_UNLIKELY(next_offs >= srv_page_size - PAGE_DIR)) { + +wrong_offs: + if (srv_force_recovery == 0 || moves_up == false) { + ib::error() << "Rec address " + << static_cast<const void*>(rec) + << ", buf block fix count " + << btr_pcur_get_block(pcur)->page + .buf_fix_count(); + + ib::error() << "Index corruption: rec offs " + << page_offset(rec) << " next offs " + << next_offs + << btr_pcur_get_block(pcur)->page.id() + << ", index " << index->name + << " of table " << index->table->name + << ". Run CHECK TABLE. 
You may need to" + " restore from a backup, or dump + drop +" + " reimport the table."; + ut_ad(0); + err = DB_CORRUPTION; + + goto page_read_error; + } else { + /* The user may be dumping a corrupt table. Jump + over the corruption to recover as much as possible. */ + + ib::info() << "Index corruption: rec offs " + << page_offset(rec) << " next offs " + << next_offs + << btr_pcur_get_block(pcur)->page.id() + << ", index " << index->name + << " of table " << index->table->name + << ". We try to skip the rest of the page."; + + page_cur_set_after_last(btr_pcur_get_block(pcur), + btr_pcur_get_page_cur(pcur)); + pcur->old_rec = nullptr; + goto next_rec; + } + } + /*-------------------------------------------------------------*/ + + /* Calculate the 'offsets' associated with 'rec' */ + + ut_ad(fil_page_index_page_check(btr_pcur_get_page(pcur))); + ut_ad(btr_page_get_index_id(btr_pcur_get_page(pcur)) == index->id); + + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (UNIV_UNLIKELY(srv_force_recovery > 0)) { + if (!rec_validate(rec, offsets) + || !btr_index_rec_validate(rec, index, FALSE)) { + + ib::error() << "Index corruption: rec offs " + << page_offset(rec) << " next offs " + << next_offs + << btr_pcur_get_block(pcur)->page.id() + << ", index " << index->name + << " of table " << index->table->name + << ". We try to skip the record."; + + goto next_rec; + } + } + + /* Note that we cannot trust the up_match value in the cursor at this + place because we can arrive here after moving the cursor! Thus + we have to recompare rec and search_tuple to determine if they + match enough. */ + + if (match_mode == ROW_SEL_EXACT) { + /* Test if the index record matches completely to search_tuple + in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */ + + /* fputs("Comparing rec and search tuple\n", stderr); */ + + if (cmp_dtuple_rec(search_tuple, rec, index, offsets)) { + + if (set_also_gap_locks + && !dict_index_is_spatial(index)) { + err = sel_set_rec_lock( + pcur, + rec, index, offsets, + prebuilt->select_lock_type, LOCK_GAP, + thr, &mtr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + default: + goto lock_wait_or_error; + } + } + + btr_pcur_store_position(pcur, &mtr); + + /* The found record was not a match, but may be used + as NEXT record (index_next). Set the relative position + to BTR_PCUR_BEFORE, to reflect that the position of + the persistent cursor is before the found/stored row + (pcur->old_rec). */ + ut_ad(pcur->rel_pos == BTR_PCUR_ON); + pcur->rel_pos = BTR_PCUR_BEFORE; + + err = DB_RECORD_NOT_FOUND; + goto normal_return; + } + + } else if (match_mode == ROW_SEL_EXACT_PREFIX) { + + if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, + index, offsets)) { + + if (set_also_gap_locks + && !dict_index_is_spatial(index)) { + err = sel_set_rec_lock( + pcur, + rec, index, offsets, + prebuilt->select_lock_type, LOCK_GAP, + thr, &mtr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + default: + goto lock_wait_or_error; + } + } + + btr_pcur_store_position(pcur, &mtr); + + /* The found record was not a match, but may be used + as NEXT record (index_next). Set the relative position + to BTR_PCUR_BEFORE, to reflect that the position of + the persistent cursor is before the found/stored row + (pcur->old_rec). 
*/ + ut_ad(pcur->rel_pos == BTR_PCUR_ON); + pcur->rel_pos = BTR_PCUR_BEFORE; + + err = DB_RECORD_NOT_FOUND; + goto normal_return; + } + } + + /* We are ready to look at a possible new index entry in the result + set: the cursor is now placed on a user record */ + + if (prebuilt->select_lock_type != LOCK_NONE) { + /* Try to place a lock on the index record; note that delete + marked records are a special case in a unique search. If there + is a non-delete marked record, then it is enough to lock its + existence with LOCK_REC_NOT_GAP. */ + + unsigned lock_type; + + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) { + /* At READ COMMITTED or READ UNCOMMITTED + isolation levels, do not lock committed + delete-marked records. */ + if (!rec_get_deleted_flag(rec, comp)) { + goto no_gap_lock; + } + + /* At most one transaction can be active + for temporary table. */ + if (clust_index->table->is_temporary()) { + goto no_gap_lock; + } + + if (index == clust_index) { + trx_id_t trx_id = row_get_rec_trx_id( + rec, index, offsets); + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(trx_id); + if (!trx_sys.is_registered(trx, trx_id)) { + /* The clustered index record + was delete-marked in a committed + transaction. Ignore the record. */ + goto locks_ok_del_marked; + } + } else if (trx_t* t = row_vers_impl_x_locked( + trx, rec, index, offsets)) { + /* The record belongs to an active + transaction. We must acquire a lock. */ + t->release_reference(); + } else { + /* The secondary index record does not + point to a delete-marked clustered index + record that belongs to an active transaction. + Ignore the secondary index record, because + it is not locked. */ + goto next_rec; + } + + goto no_gap_lock; + } + +#ifdef WITH_WSREP + if (UNIV_UNLIKELY(!set_also_gap_locks)) { + ut_ad(wsrep_thd_skip_locking(trx->mysql_thd)); + goto no_gap_lock; + } +#else /* WITH_WSREP */ + ut_ad(set_also_gap_locks); +#endif /* WITH_WSREP */ + + /* Set next-key lock both for delete- and non-delete-marked + records for unique search, because non-delete-marked record can + be marked as deleted while transaction suspends. */ + if (index->is_spatial()) { + goto no_gap_lock; + } + + /* If we are doing a 'greater or equal than a primary key + value' search from a clustered index, and we find a record + that has that exact primary key value, then there is no need + to lock the gap before the record, because no insert in the + gap can be in our search range. That is, no phantom row can + appear that way. + + An example: if col1 is the primary key, the search is WHERE + col1 >= 100, and we find a record where col1 = 100, then no + need to lock the gap before that record. */ + + if (index == clust_index + && mode == PAGE_CUR_GE + && direction == 0 + && dtuple_get_n_fields_cmp(search_tuple) + == dict_index_get_n_unique(index) + && !cmp_dtuple_rec(search_tuple, rec, index, offsets)) { +no_gap_lock: + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = sel_set_rec_lock(pcur, + rec, index, offsets, + prebuilt->select_lock_type, + lock_type, thr, &mtr); + + switch (err) { + const rec_t* old_vers; + case DB_SUCCESS_LOCKED_REC: + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) { + /* Note that a record of + prebuilt->index was locked. 
*/ + prebuilt->new_rec_locks = 1; + } + err = DB_SUCCESS; + /* fall through */ + case DB_SUCCESS: + break; + case DB_LOCK_WAIT: + /* Lock wait for R-tree should already + be handled in sel_set_rtr_rec_lock() */ + ut_ad(!dict_index_is_spatial(index)); + /* Never unlock rows that were part of a conflict. */ + prebuilt->new_rec_locks = 0; + + if (UNIV_LIKELY(prebuilt->row_read_type + != ROW_READ_TRY_SEMI_CONSISTENT) + || unique_search + || index != clust_index) { + if (!prebuilt->skip_locked) { + goto lock_wait_or_error; + } + } else { + /* The following call returns 'offsets' + associated with 'old_vers' */ + row_sel_build_committed_vers_for_mysql( + clust_index, prebuilt, rec, + &offsets, &heap, &old_vers, + need_vrow ? &vrow : NULL, &mtr); + } + + /* Check whether it was a deadlock or not, if not + a deadlock and the transaction had to wait then + release the lock it is waiting on. */ + + err = lock_trx_handle_wait(trx); + + switch (err) { + case DB_SUCCESS: + ut_ad( + !trx->lock.was_chosen_as_deadlock_victim); + /* The lock was granted while we were + searching for the last committed version. + Do a normal locking read. */ + + offsets = rec_get_offsets( + rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + goto locks_ok; + case DB_DEADLOCK: + goto lock_wait_or_error; + case DB_LOCK_WAIT: + ut_ad(!dict_index_is_spatial(index)); + err = DB_SUCCESS; + if (prebuilt->skip_locked) { + goto next_rec; + } + break; + case DB_LOCK_WAIT_TIMEOUT: + if (prebuilt->skip_locked) { + err = DB_SUCCESS; + goto next_rec; + } + /* fall through */ + default: + ut_error; + } + + if (old_vers == NULL) { + /* The row was not yet committed */ + + goto next_rec; + } + + did_semi_consistent_read = true; + rec = old_vers; + break; + case DB_RECORD_NOT_FOUND: + if (dict_index_is_spatial(index)) { + goto next_rec; + } else { + goto lock_wait_or_error; + } + break; + case DB_LOCK_WAIT_TIMEOUT: + if (prebuilt->skip_locked) { + err = DB_SUCCESS; + goto next_rec; + } + /* fall through */ + default: + + goto lock_wait_or_error; + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED + || prebuilt->table->is_temporary() + || prebuilt->table->no_rollback()) { + + /* Do nothing: we let a non-locking SELECT read the + latest version of the record */ + + } else if (index == clust_index) { + + /* Fetch a previous version of the row if the current + one is not visible in the snapshot; if we have a very + high force recovery level set, we try to avoid crashes + by skipping this lookup */ + + err = row_sel_clust_sees(rec, *index, offsets, + trx->read_view); + + switch (err) { + default: + goto lock_wait_or_error; + case DB_SUCCESS: + break; + case DB_SUCCESS_LOCKED_REC: + ut_ad(srv_force_recovery + < SRV_FORCE_NO_UNDO_LOG_SCAN); + rec_t* old_vers; + /* The following call returns 'offsets' + associated with 'old_vers' */ + err = row_sel_build_prev_vers_for_mysql( + prebuilt, clust_index, + rec, &offsets, &heap, &old_vers, + need_vrow ? &vrow : nullptr, &mtr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (old_vers == NULL) { + /* The row did not exist yet in + the read view */ + + goto next_rec; + } + + rec = old_vers; + } + } else { + /* We are looking into a non-clustered index, + and to get the right version of the record we + have to look also into the clustered index: this + is necessary, because we can only get the undo + information via the clustered index record. 
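+	As a shortcut, if PAGE_MAX_TRX_ID of the secondary index leaf page is visible in the read view, then all changes on this page are visible as well, and the clustered index lookup can be skipped.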
*/ + + ut_ad(!dict_index_is_clust(index)); + + if (!srv_read_only_mode) { + trx_id_t trx_id = page_get_max_trx_id( + page_align(rec)); + ut_ad(trx_id); + if (trx->read_view.sees(trx_id)) { + goto locks_ok; + } + /* We should look at the clustered index. + However, as this is a non-locking read, + we can skip the clustered index lookup if + the condition does not match the secondary + index entry. */ + switch (row_search_idx_cond_check( + buf, prebuilt, rec, offsets)) { + case CHECK_NEG: + goto next_rec; + case CHECK_ABORTED_BY_USER: + err = DB_INTERRUPTED; + goto idx_cond_failed; + case CHECK_OUT_OF_RANGE: + case CHECK_ERROR: + err = DB_RECORD_NOT_FOUND; + goto idx_cond_failed; + case CHECK_POS: + goto requires_clust_rec; + } + + ut_error; + } + } + } + +locks_ok: + /* NOTE that at this point rec can be an old version of a clustered + index record built for a consistent read. We cannot assume after this + point that rec is on a buffer pool page. Functions like + page_rec_is_comp() cannot be used! */ + + if (rec_get_deleted_flag(rec, comp)) { +locks_ok_del_marked: + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(index != clust_index + || row_get_rec_trx_id(rec, index, offsets)); + + /* The record is delete-marked: we can skip it */ + + /* This is an optimization to skip setting the next key lock + on the record that follows this delete-marked record. This + optimization works because of the unique search criteria + which precludes the presence of a range lock between this + delete marked record and the record following it. + + For now this is applicable only to clustered indexes while + doing a unique search except for HANDLER queries because + HANDLER allows NEXT and PREV even in unique search on + clustered index. There is scope for further optimization + applicable to unique secondary indexes. Current behaviour is + to widen the scope of a lock on an already delete marked record + if the same record is deleted twice by the same transaction */ + if (index == clust_index && unique_search + && !prebuilt->used_in_HANDLER) { + + err = DB_RECORD_NOT_FOUND; + + goto normal_return; + } + + goto next_rec; + } + + /* Check if the record matches the index condition. */ + switch (row_search_idx_cond_check(buf, prebuilt, rec, offsets)) { + case CHECK_NEG: + if (did_semi_consistent_read) { + row_unlock_for_mysql(prebuilt, TRUE); + } + goto next_rec; + case CHECK_ABORTED_BY_USER: + err = DB_INTERRUPTED; + goto idx_cond_failed; + case CHECK_OUT_OF_RANGE: + case CHECK_ERROR: + err = DB_RECORD_NOT_FOUND; + goto idx_cond_failed; + case CHECK_POS: + break; + } + + if (index != clust_index && prebuilt->need_to_access_clustered) { + if (row_search_with_covering_prefix(prebuilt, rec, offsets)) { + goto use_covering_index; + } +requires_clust_rec: + ut_ad(index != clust_index); + /* We use a 'goto' to the preceding label if a consistent + read of a secondary index record requires us to look up old + versions of the associated clustered index record. */ + + ut_ad(rec_offs_validate(rec, index, offsets)); + + /* It was a non-clustered index and we must fetch also the + clustered index record */ + + mtr_extra_clust_savepoint = mtr.get_savepoint(); + + ut_ad(!vrow); + /* The following call returns 'offsets' associated with + 'clust_rec'. Note that 'clust_rec' can be an old version + built for a consistent read. */ + + err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec, + thr, &clust_rec, + &offsets, &heap, + need_vrow ? 
&vrow : NULL, + &mtr); + if (err == DB_LOCK_WAIT && prebuilt->skip_locked) { + err = lock_trx_handle_wait(trx); + } + switch (err) { + case DB_SUCCESS: + if (clust_rec == NULL) { + /* The record did not exist in the read view */ + ut_ad(prebuilt->select_lock_type == LOCK_NONE + || dict_index_is_spatial(index)); + goto next_rec; + } + break; + case DB_SUCCESS_LOCKED_REC: + ut_a(clust_rec != NULL); + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) { + /* Note that the clustered index record + was locked. */ + prebuilt->new_rec_locks = 2; + } + err = DB_SUCCESS; + break; + case DB_LOCK_WAIT_TIMEOUT: + case DB_LOCK_WAIT: + if (prebuilt->skip_locked) { + err = DB_SUCCESS; + goto next_rec; + } + /* fall through */ + default: + vrow = NULL; + goto lock_wait_or_error; + } + + if (rec_get_deleted_flag(clust_rec, comp)) { + + /* The record is delete marked: we can skip it */ + + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED + && prebuilt->select_lock_type != LOCK_NONE) { + + /* No need to keep a lock on a delete-marked + record if we do not want to use next-key + locking. */ + + row_unlock_for_mysql(prebuilt, TRUE); + } + + goto next_rec; + } + + if (need_vrow && !vrow) { + if (!heap) { + heap = mem_heap_create(100); + } + row_sel_fill_vrow(rec, index, &vrow, heap); + } + + result_rec = clust_rec; + ut_ad(rec_offs_validate(result_rec, clust_index, offsets)); + + if (prebuilt->pk_filter || prebuilt->idx_cond) { + /* Convert the record to MySQL format. We were + unable to do this in row_search_idx_cond_check(), + because the condition is on the secondary index + and the requested column is in the clustered index. + We convert all fields, including those that + may have been used in ICP, because the + secondary index may contain a column prefix + rather than the full column. Also, as noted + in Bug #56680, the column in the secondary + index may be in the wrong case, and the + authoritative case is in result_rec, the + appropriate version of the clustered index record. */ + if (!row_sel_store_mysql_rec( + buf, prebuilt, result_rec, vrow, + true, clust_index, offsets)) { + goto next_rec; + } + } + } else { +use_covering_index: + result_rec = rec; + } + + /* We found a qualifying record 'result_rec'. At this point, + 'offsets' are associated with 'result_rec'. */ + + ut_ad(rec_offs_validate(result_rec, + result_rec != rec ? clust_index : index, + offsets)); + ut_ad(!rec_get_deleted_flag(result_rec, comp)); + + /* Decide whether to prefetch extra rows. + At this point, the clustered index record is protected + by a page latch that was acquired when pcur was positioned. + The latch will not be released until mtr.commit(). */ + + if ((match_mode == ROW_SEL_EXACT + || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD) + && prebuilt->select_lock_type == LOCK_NONE + && !prebuilt->templ_contains_blob + && !prebuilt->clust_index_was_generated + && !prebuilt->used_in_HANDLER + && !prebuilt->in_fts_query) { + /* Inside an update, for example, we do not cache rows, + since we may use the cursor position to do the actual + update, that is why we require ...lock_type == LOCK_NONE. + Since we keep space in prebuilt only for the BLOBs of + a single row, we cannot cache rows in the case there + are BLOBs in the fields to be fetched. In HANDLER we do + not cache rows because there the cursor is a scrollable + cursor. */ + + ut_a(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE); + + /* We only convert from InnoDB row format to MySQL row + format when ICP is disabled. 
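+	When the primary key filter or ICP is enabled, the row has already been converted into buf above, and it only needs to be enqueued in the prefetch cache here.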
*/ + + if (!prebuilt->pk_filter && !prebuilt->idx_cond) { + /* We use next_buf to track the allocation of buffers + where we store and enqueue the buffers for our + pre-fetch optimisation. + + If next_buf == 0 then we store the converted record + directly into the MySQL record buffer (buf). If it is + != 0 then we allocate a pre-fetch buffer and store the + converted record there. + + If the conversion fails and the MySQL record buffer + was not written to then we reset next_buf so that + we can re-use the MySQL record buffer in the next + iteration. */ + + next_buf = next_buf + ? row_sel_fetch_last_buf(prebuilt) : buf; + + if (!row_sel_store_mysql_rec( + next_buf, prebuilt, result_rec, vrow, + result_rec != rec, + result_rec != rec ? clust_index : index, + offsets)) { + + if (next_buf == buf) { + ut_a(prebuilt->n_fetch_cached == 0); + next_buf = 0; + } + + /* Only fresh inserts may contain incomplete + externally stored columns. Pretend that such + records do not exist. Such records may only be + accessed at the READ UNCOMMITTED isolation + level or when rolling back a recovered + transaction. Rollback happens at a lower + level, not here. */ + goto next_rec; + } + + if (next_buf != buf) { + row_sel_enqueue_cache_row_for_mysql( + next_buf, prebuilt); + } + } else { + row_sel_enqueue_cache_row_for_mysql(buf, prebuilt); + } + + if (prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE) { + goto next_rec; + } + } else { + if (!prebuilt->pk_filter && !prebuilt->idx_cond) { + /* The record was not yet converted to MySQL format. */ + if (!row_sel_store_mysql_rec( + buf, prebuilt, result_rec, vrow, + result_rec != rec, + result_rec != rec ? clust_index : index, + offsets)) { + /* Only fresh inserts may contain + incomplete externally stored + columns. Pretend that such records do + not exist. Such records may only be + accessed at the READ UNCOMMITTED + isolation level or when rolling back a + recovered transaction. Rollback + happens at a lower level, not here. */ + goto next_rec; + } + } + + if (!prebuilt->clust_index_was_generated) { + } else if (result_rec != rec || index->is_primary()) { + memcpy(prebuilt->row_id, result_rec, DATA_ROW_ID_LEN); + } else { + ulint len; + const byte* data = rec_get_nth_field( + result_rec, offsets, index->n_fields - 1, + &len); + ut_ad(dict_index_get_nth_col(index, + index->n_fields - 1) + ->prtype == (DATA_ROW_ID | DATA_NOT_NULL)); + ut_ad(len == DATA_ROW_ID_LEN); + memcpy(prebuilt->row_id, data, DATA_ROW_ID_LEN); + } + } + + /* From this point on, 'offsets' are invalid. */ + + /* We have an optimization to save CPU time: if this is a consistent + read on a unique condition on the clustered index, then we do not + store the pcur position, because any fetch next or prev will anyway + return 'end of file'. Exceptions are locking reads and the MySQL + HANDLER command where the user can move the cursor with PREV or NEXT + even after a unique search. */ + + err = DB_SUCCESS; + +idx_cond_failed: + if (!unique_search + || !dict_index_is_clust(index) + || direction != 0 + || prebuilt->select_lock_type != LOCK_NONE + || prebuilt->used_in_HANDLER) { + + /* Inside an update always store the cursor position */ + + if (!spatial_search) { + btr_pcur_store_position(pcur, &mtr); + } + } + + goto normal_return; + +next_rec: + /* Reset the old and new "did semi-consistent read" flags. 
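+	The flags must describe only the row that is actually returned to MySQL, so any state left over from the previous candidate record is cleared here.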
*/ + if (UNIV_UNLIKELY(prebuilt->row_read_type + == ROW_READ_DID_SEMI_CONSISTENT)) { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } +next_rec_after_check: + did_semi_consistent_read = false; + prebuilt->new_rec_locks = 0; + vrow = NULL; + + /*-------------------------------------------------------------*/ + /* PHASE 5: Move the cursor to the next index record */ + + /* NOTE: For moves_up==FALSE, the mini-transaction will be + committed and restarted every time when switching b-tree + pages. For moves_up==TRUE in index condition pushdown, we can + scan an entire secondary index tree within a single + mini-transaction. As long as the prebuilt->idx_cond does not + match, we do not need to consult the clustered index or + return records to MySQL, and thus we can avoid repositioning + the cursor. What prevents us from buffer-fixing all leaf pages + within the mini-transaction is the btr_leaf_page_release() + call in btr_pcur_move_to_next_page(). Only the leaf page where + the cursor is positioned will remain buffer-fixed. + For R-tree spatial search, we also commit the mini-transaction + each time */ + + if (spatial_search) { + /* No need to do store restore for R-tree */ + mtr.rollback_to_savepoint(0); + } else if (mtr_extra_clust_savepoint) { + /* We must release any clustered index latches + if we are moving to the next non-clustered + index record, because we could break the latching + order if we would access a different clustered + index page right away without releasing the previous. */ + mtr.rollback_to_savepoint(mtr_extra_clust_savepoint); + } + + mtr_extra_clust_savepoint = 0; + + if (moves_up) { + if (UNIV_UNLIKELY(spatial_search)) { + if (rtr_pcur_move_to_next( + search_tuple, mode, pcur, 0, &mtr)) { + goto rec_loop; + } + } else { + /* This is based on btr_pcur_move_to_next() */ + ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(pcur->latch_mode != BTR_NO_LATCHES); + pcur->old_rec = nullptr; + if (btr_pcur_is_after_last_on_page(pcur)) { + if (btr_pcur_is_after_last_in_tree(pcur)) { + goto not_moved; + } + err = btr_pcur_move_to_next_page(pcur, &mtr); + if (err != DB_SUCCESS) { + goto lock_wait_or_error; + } + } else if (!btr_pcur_move_to_next_on_page(pcur)) { + goto corrupted; + } + + goto rec_loop; + } + } else { + if (btr_pcur_move_to_prev(pcur, &mtr)) { + goto rec_loop; + } + if (UNIV_UNLIKELY(!btr_pcur_get_rec(pcur))) { +corrupted: + err = DB_CORRUPTION; + goto normal_return; + } + } + +not_moved: + if (!spatial_search) { + btr_pcur_store_position(pcur, &mtr); + } + + err = match_mode ? DB_RECORD_NOT_FOUND : DB_END_OF_INDEX; + goto normal_return; + +lock_wait_or_error: + if (!dict_index_is_spatial(index)) { + btr_pcur_store_position(pcur, &mtr); + } +page_read_error: + /* Reset the old and new "did semi-consistent read" flags. 
*/ + if (UNIV_UNLIKELY(prebuilt->row_read_type + == ROW_READ_DID_SEMI_CONSISTENT)) { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } + did_semi_consistent_read = false; + +lock_table_wait: + mtr.commit(); + mtr_extra_clust_savepoint = 0; + + trx->error_state = err; + thr->lock_state = QUE_THR_LOCK_ROW; + + if (row_mysql_handle_errors(&err, trx, thr, NULL)) { + /* It was a lock wait, and it ended */ + + thr->lock_state = QUE_THR_LOCK_NOLOCK; + mtr.start(); + + /* Table lock waited, go try to obtain table lock + again */ + if (table_lock_waited) { + table_lock_waited = FALSE; + + goto wait_table_again; + } + + if (!dict_index_is_spatial(index)) { + sel_restore_position_for_mysql( + &same_user_rec, BTR_SEARCH_LEAF, pcur, + moves_up, &mtr); + } + + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED + && !same_user_rec) { + + /* Since we were not able to restore the cursor + on the same user record, we cannot use + row_unlock_for_mysql() to unlock any records, and + we must thus reset the new rec lock info. Since + in lock0lock.cc we have blocked the inheriting of gap + X-locks, we actually do not have any new record locks + set in this case. + + Note that if we were able to restore on the 'same' + user record, it is still possible that we were actually + waiting on a delete-marked record, and meanwhile + it was removed by purge and inserted again by some + other user. But that is no problem, because in + rec_loop we will again try to set a lock, and + new_rec_lock_info in trx will be right at the end. */ + + prebuilt->new_rec_locks = 0; + } + + mode = pcur->search_mode; + + goto rec_loop; + } + + thr->lock_state = QUE_THR_LOCK_NOLOCK; + + goto func_exit; + +normal_return: + mtr.commit(); + + DEBUG_SYNC_C("row_search_for_mysql_before_return"); + + if (prebuilt->pk_filter || prebuilt->idx_cond) { + /* When ICP is active we don't write to the MySQL buffer + directly, only to buffers that are enqueued in the pre-fetch + queue. We need to dequeue the first buffer and copy the contents + to the record buffer that was passed in by MySQL. */ + + if (prebuilt->n_fetch_cached > 0) { + row_sel_dequeue_cached_row_for_mysql(buf, prebuilt); + err = DB_SUCCESS; + } + + } else if (next_buf != 0) { + + /* We may or may not have enqueued some buffers to the + pre-fetch queue, but we definitely wrote to the record + buffer passed to use by MySQL. */ + + DEBUG_SYNC_C("row_search_cached_row"); + err = DB_SUCCESS; + } + +#ifdef UNIV_DEBUG + if (dict_index_is_spatial(index) && err != DB_SUCCESS + && err != DB_END_OF_INDEX && err != DB_INTERRUPTED) { + rtr_node_path_t* path = pcur->btr_cur.rtr_info->path; + + ut_ad(path->empty()); + } +#endif + +func_exit: + trx->op_info = ""; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + /* Set or reset the "did semi-consistent read" flag on return. + The flag did_semi_consistent_read is set if and only if + the record being returned was fetched with a semi-consistent read. */ + ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS + || !did_semi_consistent_read); + + if (prebuilt->row_read_type != ROW_READ_WITH_LOCKS) { + if (did_semi_consistent_read) { + prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT; + } else { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } + } + + DEBUG_SYNC_C("innodb_row_search_for_mysql_exit"); + + DBUG_RETURN(err); +} + +/********************************************************************//** +Count rows in a R-Tree leaf level. 
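+This is used by CHECK TABLE for SPATIAL indexes, which cannot be verified with the ordinary ascending-order scan of row_check_index().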
+@return DB_SUCCESS if successful */ +dberr_t +row_count_rtree_recs( +/*=================*/ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the + table handle; this contains the info + of search_tuple, index; if search + tuple contains 0 fields then we + position the cursor at the start or + the end of the index, depending on + 'mode' */ + ulint* n_rows) /*!< out: number of entries + seen in the consistent read */ +{ + dict_index_t* index = prebuilt->index; + dberr_t ret = DB_SUCCESS; + mtr_t mtr; + mem_heap_t* heap; + dtuple_t* entry; + dtuple_t* search_entry = prebuilt->search_tuple; + ulint entry_len; + ulint i; + byte* buf; + + ut_a(dict_index_is_spatial(index)); + + *n_rows = 0; + + heap = mem_heap_create(256); + + /* Build a search tuple. */ + entry_len = dict_index_get_n_fields(index); + entry = dtuple_create(heap, entry_len); + + for (i = 0; i < entry_len; i++) { + const dict_field_t* ind_field + = dict_index_get_nth_field(index, i); + const dict_col_t* col + = ind_field->col; + dfield_t* dfield + = dtuple_get_nth_field(entry, i); + + if (i == 0) { + double* mbr; + double tmp_mbr[SPDIMS * 2]; + + dfield->type.mtype = DATA_GEOMETRY; + dfield->type.prtype |= DATA_GIS_MBR; + + /* Allocate memory for mbr field */ + mbr = static_cast<double*> + (mem_heap_alloc(heap, DATA_MBR_LEN)); + + /* Set mbr field data. */ + dfield_set_data(dfield, mbr, DATA_MBR_LEN); + + for (uint j = 0; j < SPDIMS; j++) { + tmp_mbr[j * 2] = DBL_MAX; + tmp_mbr[j * 2 + 1] = -DBL_MAX; + } + dfield_write_mbr(dfield, tmp_mbr); + continue; + } + + dfield->type.mtype = col->mtype; + dfield->type.prtype = col->prtype; + + } + + prebuilt->search_tuple = entry; + + ulint bufsize = std::max<ulint>(srv_page_size, + prebuilt->mysql_row_len); + buf = static_cast<byte*>(ut_malloc_nokey(bufsize)); + + ulint direction = 0; + +loop: + ret = row_search_mvcc(buf, PAGE_CUR_WITHIN, prebuilt, 0, direction); + direction = ROW_SEL_NEXT; + + switch (ret) { + case DB_SUCCESS: + break; + case DB_DEADLOCK: + case DB_LOCK_TABLE_FULL: + case DB_LOCK_WAIT_TIMEOUT: + case DB_INTERRUPTED: + goto func_exit; + default: + /* fall through (this error is ignored by CHECK TABLE) */ + case DB_END_OF_INDEX: + ret = DB_SUCCESS; +func_exit: + prebuilt->search_tuple = search_entry; + ut_free(buf); + mem_heap_free(heap); + + return(ret); + } + + ++*n_rows; + goto loop; +} + +/** Check if a version of a clustered index record and a secondary +index record match. + +@param prebuilt index and transaction +@param clust_rec a version of a clustered index record +@param clust_index clustered index +@param clust_offsets rec_get_offsets(clust_rec, clust_index) +@param rec secondary index leaf page record +@param offsets rec_get_offsets(rec, index) +@return an error code +@retval DB_SUCCESS if rec matches clust_rec +@retval DB_SUCCESS_LOCKED_REC if rec does not match clust_rec +*/ +static dberr_t row_check_index_match(row_prebuilt_t *prebuilt, + const rec_t *clust_rec, + const dict_index_t *clust_index, + const rec_offs *clust_offsets, + const rec_t *rec, + const dict_index_t *index, + const rec_offs *offsets) +{ + ut_ad(index == prebuilt->index); + + ib_vcol_row vc(index->has_virtual() ? 
mem_heap_create(256) : nullptr); + + const uint16_t n= index->n_user_defined_cols; + + for (uint16_t i= 0; i < n; i++) + { + ulint pos= 0; + ulint len, sec_len; + + const dict_field_t &ifield= index->fields[i]; + const byte *sec_field= rec_get_nth_field(rec, offsets, i, &sec_len); + const byte *field; + + if (ifield.col->is_virtual()) + { + /* Virtual column values must be reconstructed from the base columns. */ + row_ext_t *ext; + byte *record= vc.record(prebuilt->trx->mysql_thd, clust_index, + &prebuilt->m_mysql_table); + const dict_v_col_t *v_col= reinterpret_cast<const dict_v_col_t*> + (ifield.col); + dtuple_t *row= row_build(ROW_COPY_POINTERS, + clust_index, clust_rec, clust_offsets, + nullptr, nullptr, nullptr, &ext, vc.heap); + if (dfield_t *vfield= + innobase_get_computed_value(row, v_col, clust_index, &vc.heap, + nullptr, nullptr, + prebuilt->trx->mysql_thd, + prebuilt->m_mysql_table, + record, nullptr, nullptr)) + { + len= vfield->len; + field= static_cast<byte*>(vfield->data); + } + else + { + innobase_report_computed_value_failed(row); + return DB_COMPUTE_VALUE_FAILED; + } + } + else + { + pos= dict_col_get_clust_pos(ifield.col, clust_index); + field= rec_get_nth_cfield(clust_rec, clust_index, clust_offsets, pos, + &len); + if (len == UNIV_SQL_NULL) + { + if (sec_len == UNIV_SQL_NULL) + continue; + return DB_SUCCESS_LOCKED_REC; + } + if (sec_len == UNIV_SQL_NULL) + return DB_SUCCESS_LOCKED_REC; + + if (rec_offs_nth_extern(clust_offsets, pos)) + { + if (len == BTR_EXTERN_FIELD_REF_SIZE) + goto compare_blobs; + len-= BTR_EXTERN_FIELD_REF_SIZE; + } + + if (ifield.prefix_len) + { + len= + dtype_get_at_most_n_mbchars(ifield.col->prtype, ifield.col->mbminlen, + ifield.col->mbmaxlen, + ifield.prefix_len, len, + reinterpret_cast<const char*>(field)); + if (len < sec_len) + goto check_for_blob; + } + else + { +check_for_blob: + if (rec_offs_nth_extern(clust_offsets, pos)) + { +compare_blobs: + if (!row_sel_sec_rec_is_for_blob(ifield.col->mtype, + ifield.col->prtype, + ifield.col->mbminlen, + ifield.col->mbmaxlen, + field, len, sec_field, sec_len, + ifield.prefix_len, + clust_index->table)) + return DB_SUCCESS_LOCKED_REC; + continue; + } + } + } + + if (cmp_data(ifield.col->mtype, ifield.col->prtype, false, + field, len, sec_field, sec_len)) + return DB_SUCCESS_LOCKED_REC; + } + + return DB_SUCCESS; +} + +/** +Check the index records in CHECK TABLE. +The index must contain entries in an ascending order, +unique constraint must not be violated by duplicated keys, +and the number of index entries is counted in according to the +current read view. 
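+In CHECK TABLE ... EXTENDED, each secondary index record is additionally matched against the corresponding clustered index record version.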
+ +@param prebuilt index and transaction +@param n_rows number of records counted + +@return error code +@retval DB_SUCCESS if no error was found */ +dberr_t row_check_index(row_prebuilt_t *prebuilt, ulint *n_rows) +{ + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + *n_rows= 0; + dict_index_t *const index= prebuilt->index; + + if (!index->is_btree()) + return DB_CORRUPTION; + + mem_heap_t *heap= mem_heap_create(100); + + dtuple_t *prev_entry= nullptr; + mtr_t mtr; + mtr.start(); + + dict_index_t *clust_index= dict_table_get_first_index(prebuilt->table); + prebuilt->clust_pcur->btr_cur.page_cur.index = clust_index; + dberr_t err= prebuilt->pcur->open_leaf(true, index, BTR_SEARCH_LEAF, &mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) + { +func_exit: + mtr.commit(); + mem_heap_free(heap); + return err; + } + + if (const trx_id_t bulk_trx_id= index->table->bulk_trx_id) + if (!prebuilt->trx->read_view.changes_visible(bulk_trx_id)) + goto func_exit; + + ReadView check_table_extended_view; + ReadView &view= + prebuilt->need_to_access_clustered && + !prebuilt->table->is_temporary() && + prebuilt->trx->isolation_level != TRX_ISO_READ_UNCOMMITTED + ? check_table_extended_view : prebuilt->trx->read_view; + if (&view == &check_table_extended_view) + check_table_extended_view.set_creator_trx_id(prebuilt->trx->id); + +page_loop: + if (&view == &check_table_extended_view) + /* In CHECK TABLE...EXTENDED, we make a copy of purge_sys.end_view + while holding a shared latch on the index leaf page. + Should a currently active purge batch desire to remove any further + records from this page, it would be blocked by our page latch. + + We will consult check_table_extended_view to determine if a + clustered index record corresponding to a secondary index record + is visible to the current purge batch. Right after we have made our + copy, purge_sys.end_view is free to be changed again. + + If we have an orphan secondary index record, we may attempt to + request a clustered index record version that cannot be retrieved + any more because the undo log records may have been freed + (according to the purge_sys.end_view). In such a case, + trx_undo_get_undo_rec() would cause + trx_undo_prev_version_build() and trx_undo_prev_version_build() + to return DB_MISSING_HISTORY. 
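+  Such a DB_MISSING_HISTORY error is converted back to DB_SUCCESS below, and the secondary index record is treated as having no matching clustered index record version.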
*/ + static_cast<ReadViewBase&>(check_table_extended_view)= + purge_sys_t::end_view_guard{}.view(); + +rec_loop: + ut_ad(err == DB_SUCCESS); + + if (!btr_pcur_move_to_next_on_page(prebuilt->pcur)) + { + err= DB_CORRUPTION; + goto func_exit; + } + + const rec_t *rec= btr_pcur_get_rec(prebuilt->pcur); + rec_offs *offsets= offsets_; + + if (page_rec_is_supremum(rec)) + { + next_page: + if (btr_pcur_is_after_last_in_tree(prebuilt->pcur)) + goto func_exit; + err= btr_pcur_move_to_next_page(prebuilt->pcur, &mtr); + if (err == DB_SUCCESS && trx_is_interrupted(prebuilt->trx)) + err= DB_INTERRUPTED; + if (UNIV_UNLIKELY(err != DB_SUCCESS)) + goto func_exit; + goto page_loop; + } + + offsets= rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + const auto info_bits= + rec_get_info_bits(rec, prebuilt->table->not_redundant()); + const bool rec_deleted= info_bits & REC_INFO_DELETED_FLAG; + + if (UNIV_UNLIKELY(info_bits & REC_INFO_MIN_REC_FLAG)) + { + if (*n_rows || !index->is_instant()) + { + push_warning_printf(prebuilt->trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, ER_NOT_KEYFILE, + "InnoDB: invalid record encountered"); + prebuilt->autoinc_error= DB_INDEX_CORRUPT; + } + goto next_rec; + } + + if (prebuilt->table->is_temporary()) + { + count_or_not: + if (rec_deleted) + goto next_rec; + } + else if (index->is_clust()) + { + if (prebuilt->trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) + goto count_or_not; + + trx_id_t rec_trx_id= row_get_rec_trx_id(rec, index, offsets); + + if (rec_trx_id >= prebuilt->trx->read_view.low_limit_id() && + UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id())) + { + invalid_trx_id: + if (prebuilt->autoinc_error == DB_SUCCESS) + push_warning_printf(prebuilt->trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: DB_TRX_ID=" TRX_ID_FMT + " exceeds the system-wide maximum", + rec_trx_id); + prebuilt->autoinc_error= DB_CORRUPTION; + goto next_rec; + } + + if (!prebuilt->trx->read_view.changes_visible(rec_trx_id)) + { + ut_ad(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN); + rec_t *old_vers; + /* The following call returns 'offsets' associated with 'old_vers' */ + err= row_sel_build_prev_vers_for_mysql(prebuilt, index, rec, &offsets, + &heap, &old_vers, nullptr, &mtr); + + if (err != DB_SUCCESS) + goto func_exit; + + if (old_vers) + { + rec= old_vers; + rec_trx_id= row_get_rec_trx_id(rec, index, offsets); + + if (rec_trx_id >= prebuilt->trx->read_view.low_limit_id() && + UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id())) + goto invalid_trx_id; + + if (!rec_get_deleted_flag(rec, prebuilt->table->not_redundant())) + goto count_row; + } + else + offsets= rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &heap); + goto next_rec; + } + else if (!rec_deleted && !rec_trx_id); + else if (!check_table_extended_view.changes_visible(rec_trx_id)); + else if (prebuilt->autoinc_error == DB_SUCCESS) + { + const char *msg= rec_deleted + ? 
"Unpurged clustered index record" + : "Clustered index record with stale history"; + + ib::warn w; + w << msg << " in table " << index->table->name << ": " + << rec_offsets_print(rec, offsets); + prebuilt->autoinc_error= DB_MISSING_HISTORY; + push_warning_printf(prebuilt->trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, "InnoDB: %s", w.m_oss.str().c_str()); + } + + goto count_or_not; + } + else if (const trx_id_t page_trx_id= page_get_max_trx_id(page_align(rec))) + { + if (page_trx_id >= trx_sys.get_max_trx_id()) + goto invalid_PAGE_MAX_TRX_ID; + if (prebuilt->trx->isolation_level == TRX_ISO_READ_UNCOMMITTED); + else if (&view == &check_table_extended_view || rec_deleted || + !view.sees(page_trx_id)) + { + bool got_extended_match= &view == &check_table_extended_view; + const auto savepoint= mtr.get_savepoint(); + + row_build_row_ref_in_tuple(prebuilt->clust_ref, rec, index, offsets); + err= btr_pcur_open_with_no_init(prebuilt->clust_ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + prebuilt->clust_pcur, &mtr); + if (err != DB_SUCCESS) + goto func_exit; + + const rec_t *clust_rec= btr_pcur_get_rec(prebuilt->clust_pcur); + + /* Note: only if the search ends up on a non-infimum record is the + low_match value the real match to the search tuple */ + + if (!page_rec_is_user_rec(clust_rec) || + btr_pcur_get_low_match(prebuilt->clust_pcur) < clust_index->n_uniq) + { + if (!rec_deleted) + { + not_found: + /* MDEV-29823 FIXME: There is a race condition between + rollback, purge, and possibly other SQL connections that + are creating and releasing read views. At the time + row_undo_mod_del_mark_or_remove_sec_low() is executing + rollback on a secondary index record, purge_sys.view + may not allow it to delete the record, and it will be + delete-marked. Eventually purge_sys.view would advance, + but the delete-marked record could never be removed, + because no undo log record was ever added to + the purge queue by trx_purge_add_undo_to_history(). + + For now, we will not flag an error about orphan secondary index + records that are delete-marked; we will only warn about them. */ + + if (!rec_deleted || prebuilt->autoinc_error == DB_SUCCESS) + { + ib::error_or_warn w(!rec_deleted); + w << "Clustered index record not found for index " + << index->name << " of table " << index->table->name + << ": " << rec_offsets_print(rec, offsets); + push_warning_printf(prebuilt->trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, "InnoDB: %s", + w.m_oss.str().c_str()); + } + + if (prebuilt->autoinc_error == DB_SUCCESS) + prebuilt->autoinc_error= rec_deleted + ? DB_MISSING_HISTORY + : DB_CORRUPTION; + } + else if (&view == &check_table_extended_view) + extended_not_found: + if (view.changes_visible(page_trx_id)) + goto not_found; + did_not_find: + mtr.rollback_to_savepoint(savepoint); + goto next_rec; + } + + rec_offs *clust_offsets; + trx_id_t rec_trx_id; + rec_t *old_vers= nullptr; + + bool found_in_view= false; + trx_id_t visible_trx_id= ~0ULL; + + if (ulint trx_id_offset= clust_index->trx_id_offset) + { + clust_offsets= nullptr; + read_trx_id: + rec_trx_id= trx_read_trx_id(clust_rec + trx_id_offset); + + if (clust_rec[trx_id_offset + DATA_TRX_ID_LEN] & 0x80) + { + if (UNIV_UNLIKELY + (rec_get_deleted_flag(clust_rec, + prebuilt->table->not_redundant()))) + { + err= DB_CORRUPTION; + goto func_exit; + } + + /* This is the oldest available record version (fresh insert). 
*/ + if (!view.changes_visible(rec_trx_id)) + { + if (rec_trx_id >= view.low_limit_id() && + UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id())) + goto invalid_rec_trx_id; + if (got_extended_match) + goto check_latest_version; + goto did_not_find; + } + } + } + else + { + clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + ulint trx_id_pos= clust_index->n_uniq ? clust_index->n_uniq : 1; + ulint len; + trx_id_offset= rec_get_nth_field_offs(clust_offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + goto read_trx_id; + } + + if (got_extended_match) + { + check_latest_version: + /* In CHECK TABLE...EXTENDED, always check if the secondary + index record matches the latest clustered index record + version, no matter if it is visible in our own read view. + + If the latest clustered index version is delete-marked and + purgeable, it is not safe to fetch any BLOBs for column prefix + indexes because they may already have been freed. */ + if (rec_trx_id && + rec_get_deleted_flag(clust_rec, + prebuilt->table->not_redundant()) && + purge_sys.is_purgeable(rec_trx_id)) + goto did_not_find; + + if (!clust_offsets) + clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + err= row_check_index_match(prebuilt, + clust_rec, clust_index, clust_offsets, + rec, index, offsets); + + switch (err) { + default: + goto func_exit; + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + } + + got_extended_match= err == DB_SUCCESS; + err= DB_SUCCESS; + + if (!prebuilt->trx->read_view.changes_visible(rec_trx_id)) + /* While CHECK TABLE ... EXTENDED checks for a matching + clustered index record version for each secondary index + record, it must count only those records that belong to its + own read view. + + If the latest version of clust_rec matches rec but is not + in our read view, there may still be an older version of + clust_rec that not only matches rec but is in our view. + We must evaluate old versions before deciding whether rec + should be counted. */ + goto check_old_vers; + + /* Remember that this is the visible clust_rec for rec, + and whether it matches rec. */ + visible_trx_id= rec_trx_id; + found_in_view= got_extended_match && + !rec_get_deleted_flag(clust_rec, + prebuilt->table->not_redundant()); + + if (!got_extended_match) + goto check_old_vers; + + if (!found_in_view) + goto did_not_find; + + found_match: + mtr.rollback_to_savepoint(savepoint); + goto count_row; + } + else if (!view.changes_visible(rec_trx_id)) + { + check_old_vers: + if (rec_trx_id >= view.low_limit_id() && + UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id())) + { + invalid_rec_trx_id: + if (prebuilt->autoinc_error == DB_SUCCESS) + push_warning_printf(prebuilt->trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: DB_TRX_ID=" TRX_ID_FMT + " exceeds the system-wide maximum", + rec_trx_id); + goto not_found; + } + + if (!clust_offsets) + clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + row_sel_reset_old_vers_heap(prebuilt); + /* The following is adapted from row_vers_build_for_consistent_read() + because when using check_table_extended_view, we must + consider every available version of the clustered index record. 
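+  Each iteration of the loop below builds the previous version of clust_rec into a fresh heap and frees the heap that held the version before it.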
*/ + mem_heap_t *vers_heap= nullptr; + + for (;;) + { + mem_heap_t *prev_heap= vers_heap; + vers_heap= mem_heap_create(1024); + err= trx_undo_prev_version_build(clust_rec, + clust_index, clust_offsets, + vers_heap, &old_vers, + nullptr, nullptr, 0); + if (prev_heap) + mem_heap_free(prev_heap); + if (err != DB_SUCCESS) + { + old_vers_err: + mem_heap_free(vers_heap); + if (err == DB_MISSING_HISTORY) + { + err= DB_SUCCESS; + if (got_extended_match) + goto did_not_find; + goto not_found; + } + goto func_exit; + } + + if (UNIV_UNLIKELY(!old_vers)) + { + mem_heap_free(vers_heap); + /* We did not find a matching clustered index record version + for the secondary index record. Normal CHECK TABLE will simply + not count the secondary index record; CHECK TABLE ... EXTENDED + will flag such orphan records if appropriate. + + A secondary index record may may be "temporarily orphan" + if purge is in progress. We will only flag them if + everything up to PAGE_MAX_TRX_ID has been fully purged. + + "Temporary orphans" may be produced when + row_undo_mod_clust() resets the DB_TRX_ID of the latest + clust_rec version or when trx_undo_prev_version_build() + encounters a BLOB that may have been freed according to + purge_sys.view (not purge_sys.end_view). */ + if (&view == &check_table_extended_view && !got_extended_match) + goto extended_not_found; + goto did_not_find; + } + + clust_rec= old_vers; + clust_offsets= rec_get_offsets(clust_rec, clust_index, clust_offsets, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + rec_trx_id= row_get_rec_trx_id(clust_rec, clust_index, + clust_offsets); + + if (UNIV_UNLIKELY(rec_trx_id >= + prebuilt->trx->read_view.low_limit_id() && + rec_trx_id >= trx_sys.get_max_trx_id())) + { + mem_heap_free(vers_heap); + goto invalid_rec_trx_id; + } + + const bool rec_visible= + prebuilt->trx->read_view.changes_visible(rec_trx_id); + const bool clust_rec_deleted= + rec_get_deleted_flag(clust_rec, prebuilt->table->not_redundant()); + + if (&view != &prebuilt->trx->read_view) + { + /* It is not safe to fetch BLOBs of committed delete-marked + records that may have been freed in purge. */ + err= clust_rec_deleted && rec_trx_id && + purge_sys.is_purgeable(rec_trx_id) + ? 
DB_SUCCESS_LOCKED_REC + : row_check_index_match(prebuilt, + clust_rec, clust_index, clust_offsets, + rec, index, offsets); + + switch (err) { + default: + goto old_vers_err; + case DB_SUCCESS_LOCKED_REC: + if (rec_visible && !~visible_trx_id) + visible_trx_id= rec_trx_id; + continue; + case DB_SUCCESS: + got_extended_match= true; + if (!rec_visible) + continue; + if (!~visible_trx_id) + { + visible_trx_id= rec_trx_id; + found_in_view= !clust_rec_deleted; + } + mem_heap_free(vers_heap); + if (!found_in_view) + goto did_not_find; + goto found_match; + } + } + else if (rec_visible) + { + if (!clust_rec_deleted) + { + clust_rec= rec_copy(mem_heap_alloc(heap, + rec_offs_size(clust_offsets)), + clust_rec, clust_offsets); + rec_offs_make_valid(clust_rec, clust_index, true, clust_offsets); + } + mem_heap_free(vers_heap); + if (clust_rec_deleted) + goto did_not_find; + goto check_match; + } + } + } + else if (rec_get_deleted_flag(clust_rec, + prebuilt->table->not_redundant())) + goto did_not_find; + + ut_ad(clust_rec); + ut_ad(&view != &check_table_extended_view); + + /* If we had to go to an earlier version of row or the secondary + index record is delete marked, then it may be that the secondary + index record corresponding to clust_rec (or old_vers) is not + rec; in that case we must ignore such row because in our + snapshot rec would not have existed. Remember that from rec we + cannot see directly which transaction id corresponds to it: we + have to go to the clustered index record. A query where we want + to fetch all rows where the secondary index value is in some + interval would return a wrong result if we would not drop rows + which we come to visit through secondary index records that + would not really exist in our snapshot. */ + + if (rec_deleted) + { + if (!clust_offsets) + clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + check_match: + /* This clustered index record version exists in + prebuilt->trx->read_view and is not delete-marked. + By design, any BLOBs in it are not allowed to be + freed in the purge of committed transaction history. 
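+  Therefore it is safe for row_check_index_match() to dereference externally stored column prefixes while comparing the records.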
*/ + err= row_check_index_match(prebuilt, clust_rec, clust_index, + clust_offsets, rec, index, offsets); + switch (err) { + case DB_SUCCESS: + break; + case DB_SUCCESS_LOCKED_REC: + err= DB_SUCCESS; + goto did_not_find; + default: + goto func_exit; + } + } + + mtr.rollback_to_savepoint(savepoint); + } + } + else + { + invalid_PAGE_MAX_TRX_ID: + if (UNIV_LIKELY(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN)) + { + push_warning_printf(prebuilt->trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, ER_NOT_KEYFILE, + "InnoDB: Invalid PAGE_MAX_TRX_ID=%llu" + " in index '%-.200s'", + page_trx_id, index->name()); + prebuilt->autoinc_error= DB_INDEX_CORRUPT; + } + goto next_rec; + } + +count_row: + ++*n_rows; + + if (prev_entry) + { + ulint matched_fields= 0; + int cmp= cmp_dtuple_rec_with_match(prev_entry, rec, index, offsets, + &matched_fields); + const char* msg; + + if (UNIV_LIKELY(cmp < 0)); + else if (cmp > 0) + { + prebuilt->autoinc_error= DB_INDEX_CORRUPT; + msg= "index records in a wrong order in "; +not_ok: + ib::error() << msg << index->name << " of table " << index->table->name + << ": " << *prev_entry << ", " + << rec_offsets_print(rec, offsets); + } + else if (index->is_unique() && matched_fields >= + dict_index_get_n_ordering_defined_by_user(index)) + { + /* NULL values in unique indexes are considered not to be duplicates */ + for (ulint i= 0; i < dict_index_get_n_ordering_defined_by_user(index); + i++) + if (dfield_is_null(dtuple_get_nth_field(prev_entry, i))) + goto next_rec; + + if (prebuilt->autoinc_error == DB_SUCCESS) + prebuilt->autoinc_error= DB_DUPLICATE_KEY; + msg= "duplicate key in "; + goto not_ok; + } + } + +next_rec: + ut_ad(err == DB_SUCCESS); + + { + mem_heap_t *tmp_heap= nullptr; + + /* Empty the heap on each round. But preserve offsets[] + for the row_rec_to_index_entry() call, by copying them + into a separate memory heap when needed. */ + if (UNIV_UNLIKELY(offsets != offsets_)) + { + ulint size= rec_offs_get_n_alloc(offsets) * sizeof *offsets; + tmp_heap= mem_heap_create(size); + offsets= static_cast<rec_offs*>(mem_heap_dup(tmp_heap, offsets, size)); + } + + mem_heap_empty(heap); + prev_entry= row_rec_to_index_entry(rec, index, offsets, heap); + + if (UNIV_LIKELY_NULL(tmp_heap)) + mem_heap_free(tmp_heap); + } + + if (btr_pcur_is_after_last_on_page(prebuilt->pcur)) + goto next_page; + + goto rec_loop; +} + +/*******************************************************************//** +Read the AUTOINC column from the current row. If the value is less than +0 and the type is not unsigned then we reset the value to 0. +@return value read from the column */ +static +ib_uint64_t +row_search_autoinc_read_column( +/*===========================*/ + dict_index_t* index, /*!< in: index to read from */ + const rec_t* rec, /*!< in: current rec */ + ulint col_no, /*!< in: column number */ + ulint mtype, /*!< in: column main type */ + ibool unsigned_type) /*!< in: signed or unsigned flag */ +{ + ulint len; + const byte* data; + ib_uint64_t value; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + + rec_offs_init(offsets_); + ut_ad(page_rec_is_leaf(rec)); + + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + col_no + 1, &heap); + + if (rec_offs_nth_sql_null(offsets, col_no)) { + /* There is no non-NULL value in the auto-increment column. 
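+	Return 0, just as if no records had been found.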
*/ + value = 0; + goto func_exit; + } + + data = rec_get_nth_field(rec, offsets, col_no, &len); + + value = row_parse_int(data, len, mtype, unsigned_type); + +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(value); +} + +/** Get the maximum and non-delete-marked record in an index. +@param[in] index index tree +@param[in,out] mtr mini-transaction (may be committed and restarted) +@return maximum record, page s-latched in mtr +@retval NULL if there are no records, or if all of them are delete-marked */ +static +const rec_t* +row_search_get_max_rec( + dict_index_t* index, + mtr_t* mtr) +{ + btr_pcur_t pcur; + const rec_t* rec; + const bool desc = index->fields[0].descending; + + if (pcur.open_leaf(desc, index, BTR_SEARCH_LEAF, mtr) != DB_SUCCESS) { + return nullptr; + } + + if (desc) { + const bool comp = index->table->not_redundant(); + while (btr_pcur_move_to_next_user_rec(&pcur, mtr)) { + rec = btr_pcur_get_rec(&pcur); + if (rec_is_metadata(rec, *index)) { + continue; + } + if (!rec_get_deleted_flag(rec, comp)) { + goto found; + } + } + } else { + do { + rec = page_find_rec_last_not_deleted( + btr_pcur_get_page(&pcur)); + if (page_rec_is_user_rec(rec)) { + goto found; + } + btr_pcur_move_before_first_on_page(&pcur); + } while (btr_pcur_move_to_prev(&pcur, mtr)); + } + + rec = nullptr; + +found: + ut_ad(!rec + || !(rec_get_info_bits(rec, dict_table_is_comp(index->table)) + & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG))); + return(rec); +} + +/** Read the max AUTOINC value from an index. +@param[in] index index starting with an AUTO_INCREMENT column +@return the largest AUTO_INCREMENT value +@retval 0 if no records were found */ +ib_uint64_t +row_search_max_autoinc(dict_index_t* index) +{ + const dict_field_t* dfield = dict_index_get_nth_field(index, 0); + + ib_uint64_t value = 0; + + mtr_t mtr; + mtr.start(); + + if (const rec_t* rec = row_search_get_max_rec(index, &mtr)) { + value = row_search_autoinc_read_column( + index, rec, 0, + dfield->col->mtype, + dfield->col->prtype & DATA_UNSIGNED); + } + + mtr.commit(); + return(value); +} diff --git a/storage/innobase/row/row0uins.cc b/storage/innobase/row/row0uins.cc new file mode 100644 index 00000000..23255cc9 --- /dev/null +++ b/storage/innobase/row/row0uins.cc @@ -0,0 +1,652 @@ +/***************************************************************************** + +Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0uins.cc +Fresh insert undo + +Created 2/25/1997 Heikki Tuuri +*******************************************************/ + +#include "row0uins.h" +#include "dict0dict.h" +#include "dict0stats.h" +#include "dict0boot.h" +#include "dict0crea.h" +#include "trx0undo.h" +#include "trx0roll.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "row0undo.h" +#include "row0vers.h" +#include "trx0trx.h" +#include "trx0rec.h" +#include "row0row.h" +#include "row0upd.h" +#include "que0que.h" +#include "ibuf0ibuf.h" +#include "log0log.h" +#include "fil0fil.h" +#include <mysql/service_thd_mdl.h> + +/************************************************************************* +IMPORTANT NOTE: Any operation that generates redo MUST check that there +is enough space in the redo log before for that operation. This is +done by calling log_free_check(). The reason for checking the +availability of the redo log space before the start of the operation is +that we MUST not hold any synchonization objects when performing the +check. +If you make a change in this module make sure that no codepath is +introduced where a call to log_free_check() is bypassed. */ + +/***************************************************************//** +Removes a clustered index record. The pcur in node was positioned on the +record, now it is detached. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_ins_remove_clust_rec( +/*==========================*/ + undo_node_t* node) /*!< in: undo node */ +{ + dberr_t err; + ulint n_tries = 0; + mtr_t mtr; + dict_index_t* index = node->pcur.index(); + table_id_t table_id = 0; + const bool dict_locked = node->trx->dict_operation_lock_mode; +restart: + MDL_ticket* mdl_ticket = nullptr; + ut_ad(!table_id || dict_locked + || !node->trx->dict_operation_lock_mode); + dict_table_t *table = table_id + ? dict_table_open_on_id(table_id, dict_locked, + DICT_TABLE_OP_OPEN_ONLY_IF_CACHED, + node->trx->mysql_thd, &mdl_ticket) + : nullptr; + + ut_ad(index->is_primary()); + ut_ad(node->trx->in_rollback); + + mtr.start(); + if (index->table->is_temporary()) { + ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); + mtr.set_log_mode(MTR_LOG_NO_REDO); + ut_ad(index->table->id >= DICT_HDR_FIRST_ID); + } else { + index->set_modified(mtr); + ut_ad(lock_table_has_locks(index->table)); + } + + /* This is similar to row_undo_mod_clust(). The DDL thread may + already have copied this row from the log to the new table. + We must log the removal, so that the row will be correctly + purged. However, we can log the removal out of sync with the + B-tree modification. */ + ut_a(node->pcur.restore_position( + (node->rec_type == TRX_UNDO_INSERT_METADATA) + ? 
BTR_MODIFY_TREE + : BTR_MODIFY_LEAF, + &mtr) == btr_pcur_t::SAME_ALL); + rec_t* rec = btr_pcur_get_rec(&node->pcur); + + ut_ad(rec_get_trx_id(rec, index) == node->trx->id + || node->table->is_temporary()); + ut_ad(!rec_get_deleted_flag(rec, index->table->not_redundant()) + || rec_is_alter_metadata(rec, index->table->not_redundant())); + ut_ad(rec_is_metadata(rec, index->table->not_redundant()) + == (node->rec_type == TRX_UNDO_INSERT_METADATA)); + + switch (node->table->id) { + case DICT_COLUMNS_ID: + /* This is rolling back an INSERT into SYS_COLUMNS. + If it was part of an instant ALTER TABLE operation, we + must evict the table definition, so that it can be + reloaded after the dictionary operation has been + completed. At this point, any corresponding operation + to the metadata record will have been rolled back. */ + ut_ad(node->trx->dict_operation_lock_mode); + ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); + if (rec_get_n_fields_old(rec) + != DICT_NUM_FIELDS__SYS_COLUMNS + || (rec_get_1byte_offs_flag(rec) + ? rec_1_get_field_end_info(rec, 0) != 8 + : rec_2_get_field_end_info(rec, 0) != 8)) { + break; + } + static_assert(!DICT_FLD__SYS_COLUMNS__TABLE_ID, ""); + node->trx->evict_table(mach_read_from_8(rec)); + break; + case DICT_INDEXES_ID: + ut_ad(node->trx->dict_operation_lock_mode); + ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); + if (!table_id) { + table_id = mach_read_from_8(rec); + if (table_id) { + mtr.commit(); + goto restart; + } + ut_ad("corrupted SYS_INDEXES record" == 0); + } + + pfs_os_file_t d = OS_FILE_CLOSED; + + const uint32_t space_id = dict_drop_index_tree( + &node->pcur, node->trx, &mtr); + if (space_id) { + if (table) { + lock_release_on_rollback(node->trx, + table); + if (!dict_locked) { + dict_sys.lock(SRW_LOCK_CALL); + } + if (table->release()) { + dict_sys.remove(table); + } else if (table->space_id + == space_id) { + table->space = nullptr; + table->file_unreadable = true; + } + if (!dict_locked) { + dict_sys.unlock(); + } + table = nullptr; + if (!mdl_ticket); + else if (MDL_context* mdl_context = + static_cast<MDL_context*>( + thd_mdl_context( + node->trx-> + mysql_thd))) { + mdl_context->release_lock( + mdl_ticket); + mdl_ticket = nullptr; + } + } + + d = fil_delete_tablespace(space_id); + } + + mtr.commit(); + + if (d != OS_FILE_CLOSED) { + os_file_close(d); + } + + if (space_id) { + ibuf_delete_for_discarded_space(space_id); + } + + mtr.start(); + ut_a(node->pcur.restore_position( + BTR_MODIFY_LEAF, &mtr) == btr_pcur_t::SAME_ALL); + } + + err = btr_cur_optimistic_delete(&node->pcur.btr_cur, 0, &mtr); + + if (err != DB_FAIL) { + goto func_exit; + } + + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); +retry: + /* If did not succeed, try pessimistic descent to tree */ + mtr.start(); + if (index->table->is_temporary()) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + index->set_modified(mtr); + } + ut_a(node->pcur.restore_position(BTR_PURGE_TREE, &mtr) + == btr_pcur_t::SAME_ALL); + + btr_cur_pessimistic_delete(&err, FALSE, &node->pcur.btr_cur, 0, true, + &mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (err == DB_OUT_OF_FILE_SPACE + && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + n_tries++; + + std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + +func_exit: + if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_INSERT_METADATA) { + /* When rolling back the very first 
instant ADD COLUMN + operation, reset the root page to the basic state. */ + btr_reset_instant(*index, true, &mtr); + } + + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); + + if (UNIV_LIKELY_NULL(table)) { + dict_table_close(table, dict_locked, + node->trx->mysql_thd, mdl_ticket); + } + + return(err); +} + +/***************************************************************//** +Removes a secondary index entry if found. +@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_ins_remove_sec_low( +/*========================*/ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: index entry to remove */ + que_thr_t* thr) /*!< in: query thread */ +{ + btr_pcur_t pcur; + dberr_t err = DB_SUCCESS; + mtr_t mtr; + const bool modify_leaf = mode == BTR_MODIFY_LEAF; + + pcur.btr_cur.page_cur.index = index; + row_mtr_start(&mtr, index, !modify_leaf); + + if (index->is_spatial()) { + mode = modify_leaf + ? btr_latch_mode(BTR_MODIFY_LEAF + | BTR_RTREE_DELETE_MARK + | BTR_RTREE_UNDO_INS) + : btr_latch_mode(BTR_PURGE_TREE | BTR_RTREE_UNDO_INS); + btr_pcur_get_btr_cur(&pcur)->thr = thr; + if (rtr_search(entry, mode, &pcur, &mtr)) { + goto func_exit; + } + + if (rec_get_deleted_flag( + btr_pcur_get_rec(&pcur), + dict_table_is_comp(index->table))) { + ib::error() << "Record found in index " << index->name + << " is deleted marked on insert rollback."; + ut_ad(0); + } + goto found; + } else if (modify_leaf) { + mode = BTR_MODIFY_LEAF_ALREADY_LATCHED; + mtr_s_lock_index(index, &mtr); + } else { + ut_ad(mode == BTR_PURGE_TREE); + mode = BTR_PURGE_TREE_ALREADY_LATCHED; + mtr_x_lock_index(index, &mtr); + } + + switch (row_search_index_entry(entry, mode, &pcur, &mtr)) { + case ROW_BUFFERED: + case ROW_NOT_DELETED_REF: + /* These are invalid outcomes, because the mode passed + to row_search_index_entry() did not include any of the + flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */ + ut_error; + case ROW_NOT_FOUND: + break; + case ROW_FOUND: + found: + btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); + + if (modify_leaf) { + err = btr_cur_optimistic_delete(btr_cur, 0, &mtr); + } else { + /* Passing rollback=false here, because we are + deleting a secondary index record: the distinction + only matters when deleting a record that contains + externally stored columns. */ + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, + false, &mtr); + } + } + +func_exit: + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(err); +} + +/***************************************************************//** +Removes a secondary index entry from the index if found. Tries first +optimistic, then pessimistic descent down the tree. 
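+If the pessimistic descent also fails, for example because of a lack of file space, it is retried up to BTR_CUR_RETRY_DELETE_N_TIMES times with a short sleep between the attempts.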
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_ins_remove_sec( +/*====================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: index entry to insert */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + ulint n_tries = 0; + + /* Try first optimistic descent to the B-tree */ + + err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry, thr); + + if (err == DB_SUCCESS) { + + return(err); + } + + /* Try then pessimistic descent to the B-tree */ +retry: + err = row_undo_ins_remove_sec_low(BTR_PURGE_TREE, index, entry, thr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + n_tries++; + + std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + return(err); +} + +/** Parse an insert undo record. +@param[in,out] node row rollback state +@param[in] dict_locked whether the data dictionary cache is locked */ +static bool row_undo_ins_parse_undo_rec(undo_node_t* node, bool dict_locked) +{ + dict_index_t* clust_index; + const byte* ptr; + undo_no_t undo_no; + table_id_t table_id; + byte dummy; + bool dummy_extern; + + ut_ad(node->trx->in_rollback); + ut_ad(trx_undo_roll_ptr_is_insert(node->roll_ptr)); + + ptr = trx_undo_rec_get_pars(node->undo_rec, &node->rec_type, &dummy, + &dummy_extern, &undo_no, &table_id); + + node->update = NULL; + if (!node->is_temp) { + node->table = dict_table_open_on_id(table_id, dict_locked, + DICT_TABLE_OP_NORMAL); + } else if (!dict_locked) { + dict_sys.freeze(SRW_LOCK_CALL); + node->table = dict_sys.acquire_temporary_table(table_id); + dict_sys.unfreeze(); + } else { + node->table = dict_sys.acquire_temporary_table(table_id); + } + + if (!node->table) { + return false; + } + + switch (node->rec_type) { + default: + ut_ad("wrong undo record type" == 0); + goto close_table; + case TRX_UNDO_INSERT_METADATA: + case TRX_UNDO_INSERT_REC: + case TRX_UNDO_EMPTY: + break; + case TRX_UNDO_RENAME_TABLE: + dict_table_t* table = node->table; + ut_ad(!table->is_temporary()); + ut_ad(table->file_unreadable + || dict_table_is_file_per_table(table) + == !is_system_tablespace(table->space_id)); + size_t len = mach_read_from_2(node->undo_rec) + - page_offset(ptr) - 2; + const span<const char> name(reinterpret_cast<const char*>(ptr), + len); + if (strlen(table->name.m_name) != len + || memcmp(table->name.m_name, ptr, len)) { + dict_table_rename_in_cache(table, name, true); + } else if (table->space && table->space->id) { + const auto s = table->space->name(); + if (len != s.size() || memcmp(ptr, s.data(), len)) { + table->rename_tablespace(name, true); + } + } + goto close_table; + } + + if (UNIV_UNLIKELY(!node->table->is_accessible())) { +close_table: + /* Normally, tables should not disappear or become + unaccessible during ROLLBACK, because they should be + protected by InnoDB table locks. Corruption could be + a valid exception. + + FIXME: When running out of temporary tablespace, it + would probably be better to just drop all temporary + tables (and temporary undo log records) of the current + connection, instead of doing this rollback. 
*/ + dict_table_close(node->table, dict_locked); + node->table = NULL; + return false; + } else { + ut_ad(!node->table->skip_alter_undo); + clust_index = dict_table_get_first_index(node->table); + + if (clust_index != NULL) { + switch (node->rec_type) { + case TRX_UNDO_INSERT_REC: + ptr = trx_undo_rec_get_row_ref( + ptr, clust_index, &node->ref, + node->heap); + break; + case TRX_UNDO_EMPTY: + node->ref = nullptr; + return true; + default: + node->ref = &trx_undo_metadata; + if (!row_undo_search_clust_to_pcur(node)) { + /* An error probably occurred during + an insert into the clustered index, + after we wrote the undo log record. */ + goto close_table; + } + return true; + } + + if (!row_undo_search_clust_to_pcur(node)) { + /* An error probably occurred during + an insert into the clustered index, + after we wrote the undo log record. */ + goto close_table; + } + if (node->table->n_v_cols) { + trx_undo_read_v_cols(node->table, ptr, + node->row, false); + } + + } else { + ib::warn() << "Table " << node->table->name + << " has no indexes," + " ignoring the table"; + goto close_table; + } + } + + return true; +} + +/***************************************************************//** +Removes secondary index records. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_ins_remove_sec_rec( +/*========================*/ + undo_node_t* node, /*!< in/out: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err = DB_SUCCESS; + dict_index_t* index; + mem_heap_t* heap; + + heap = mem_heap_create(1024); + + for (index = node->index; index; + index = dict_table_get_next_index(index)) { + if (index->type & (DICT_FTS | DICT_CORRUPT) + || !index->is_committed()) { + continue; + } + + /* An insert undo record TRX_UNDO_INSERT_REC will + always contain all fields of the index. It does not + matter if any indexes were created afterwards; all + index entries can be reconstructed from the row. */ + dtuple_t* entry = row_build_index_entry( + node->row, node->ext, index, heap); + if (UNIV_UNLIKELY(!entry)) { + /* The database must have crashed after + inserting a clustered index record but before + writing all the externally stored columns of + that record, or a statement is being rolled + back because an error occurred while storing + off-page columns. + + Because secondary index entries are inserted + after the clustered index record, we may + assume that the secondary index record does + not exist. */ + } else { + err = row_undo_ins_remove_sec(index, entry, thr); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto func_exit; + } + } + + mem_heap_empty(heap); + } + +func_exit: + node->index = index; + mem_heap_free(heap); + return(err); +} + +/***********************************************************//** +Undoes a fresh insert of a row to a table. A fresh insert means that +the same clustered index unique key did not have any record, even delete +marked, at the time of the insert. InnoDB is eager in a rollback: +if it figures out that an index record will be removed in the purge +anyway, it will remove it in the rollback. 
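[Editor's sketch] row_undo_ins() below rolls back a fresh insert by removing the entry from every secondary index first and the clustered index record last, i.e. the reverse of the insert order. A minimal sketch of that ordering, using stand-in callables rather than the InnoDB API:

// Illustrative only; the callables are hypothetical stand-ins for
// row_undo_ins_remove_sec_rec() and row_undo_ins_remove_clust_rec().
#include <functional>
#include <vector>

enum class UndoStatus { SUCCESS, OUT_OF_FILE_SPACE };

UndoStatus undo_fresh_insert(
    const std::vector<std::function<UndoStatus()>>& remove_secondary_entries,
    const std::function<UndoStatus()>& remove_clustered_record)
{
    for (const auto& remove_entry : remove_secondary_entries) {
        // A missing secondary entry is tolerated by the real code: it may
        // never have been written if the server crashed mid-statement.
        if (UndoStatus s = remove_entry(); s != UndoStatus::SUCCESS) {
            return s;
        }
    }
    // Only after the secondary indexes are clean is the clustered record
    // removed, mirroring the order in row_undo_ins().
    return remove_clustered_record();
}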
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +dberr_t +row_undo_ins( +/*=========*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + const bool dict_locked = node->trx->dict_operation_lock_mode; + + if (!row_undo_ins_parse_undo_rec(node, dict_locked)) { + return DB_SUCCESS; + } + + ut_ad(node->table->is_temporary() + || lock_table_has_locks(node->table)); + + /* Iterate over all the indexes and undo the insert.*/ + + node->index = dict_table_get_first_index(node->table); + ut_ad(dict_index_is_clust(node->index)); + + switch (node->rec_type) { + default: + ut_ad("wrong undo record type" == 0); + /* fall through */ + case TRX_UNDO_INSERT_REC: + /* Skip the clustered index (the first index) */ + node->index = dict_table_get_next_index(node->index); + + err = row_undo_ins_remove_sec_rec(node, thr); + + if (err != DB_SUCCESS) { + break; + } + + log_free_check(); + + if (!dict_locked && node->table->id == DICT_INDEXES_ID) { + dict_sys.lock(SRW_LOCK_CALL); + err = row_undo_ins_remove_clust_rec(node); + dict_sys.unlock(); + } else { + ut_ad(node->table->id != DICT_INDEXES_ID + || !node->table->is_temporary()); + err = row_undo_ins_remove_clust_rec(node); + } + + if (err == DB_SUCCESS && node->table->stat_initialized) { + /* Not protected by dict_sys.latch + or table->stats_mutex_lock() for + performance reasons, we would rather get garbage + in stat_n_rows (which is just an estimate anyway) + than protecting the following code with a latch. */ + dict_table_n_rows_dec(node->table); + + /* Do not attempt to update statistics when + executing ROLLBACK in the InnoDB SQL + interpreter, because in that case we would + already be holding dict_sys.latch, which + would be acquired when updating statistics. */ + if (!dict_locked) { + dict_stats_update_if_needed(node->table, + *node->trx); + } + } + break; + + case TRX_UNDO_INSERT_METADATA: + log_free_check(); + ut_ad(!node->table->is_temporary()); + err = row_undo_ins_remove_clust_rec(node); + break; + case TRX_UNDO_EMPTY: + err = node->table->clear(thr); + break; + } + + dict_table_close(node->table, dict_locked); + + node->table = NULL; + + return(err); +} diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc new file mode 100644 index 00000000..a01eaea5 --- /dev/null +++ b/storage/innobase/row/row0umod.cc @@ -0,0 +1,1288 @@ +/***************************************************************************** + +Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0umod.cc +Undo modify of a row + +Created 2/27/1997 Heikki Tuuri +*******************************************************/ + +#include "row0umod.h" +#include "dict0dict.h" +#include "dict0stats.h" +#include "dict0boot.h" +#include "trx0undo.h" +#include "trx0roll.h" +#include "trx0purge.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "ibuf0ibuf.h" +#include "row0undo.h" +#include "row0vers.h" +#include "trx0trx.h" +#include "trx0rec.h" +#include "row0row.h" +#include "row0upd.h" +#include "que0que.h" +#include "log0log.h" + +/* Considerations on undoing a modify operation. +(1) Undoing a delete marking: all index records should be found. Some of +them may have delete mark already FALSE, if the delete mark operation was +stopped underway, or if the undo operation ended prematurely because of a +system crash. +(2) Undoing an update of a delete unmarked record: the newer version of +an updated secondary index entry should be removed if no prior version +of the clustered index record requires its existence. Otherwise, it should +be delete marked. +(3) Undoing an update of a delete marked record. In this kind of update a +delete marked clustered index record was delete unmarked and possibly also +some of its fields were changed. Now, it is possible that the delete marked +version has become obsolete at the time the undo is started. */ + +/************************************************************************* +IMPORTANT NOTE: Any operation that generates redo MUST check that there +is enough space in the redo log before for that operation. This is +done by calling log_free_check(). The reason for checking the +availability of the redo log space before the start of the operation is +that we MUST not hold any synchonization objects when performing the +check. +If you make a change in this module make sure that no codepath is +introduced where a call to log_free_check() is bypassed. */ + +/***********************************************************//** +Undoes a modify in a clustered index record. 
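[Editor's sketch] The IMPORTANT NOTE above describes a calling convention rather than an algorithm: the redo-log free-space check must happen while the caller holds no latches, i.e. before the mini-transaction that will generate the redo. A hedged sketch of that ordering, with made-up names standing in for log_free_check() and the mtr machinery:

// Sketch of the convention only; ensure_redo_log_space(), MiniTransaction and
// apply_changes are hypothetical stand-ins, not InnoDB functions.
struct MiniTransaction {
    void start()  {}   // would begin latching pages and buffering redo
    void commit() {}   // would write the buffered redo and release latches
};

void ensure_redo_log_space() {
    // Corresponds to log_free_check(): may wait for a checkpoint, so it must
    // be called while no synchronization objects are held.
}

void redo_generating_operation(void (*apply_changes)(MiniTransaction&)) {
    ensure_redo_log_space();   // 1. check space first, with no latches held
    MiniTransaction mtr;
    mtr.start();               // 2. only now acquire latches
    apply_changes(mtr);        // 3. generate the redo that was budgeted for
    mtr.commit();
}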
+@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_clust_low( +/*===================*/ + undo_node_t* node, /*!< in: row undo node */ + rec_offs** offsets,/*!< out: rec_get_offsets() on the record */ + mem_heap_t** offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + byte* sys, /*!< out: DB_TRX_ID, DB_ROLL_PTR + for row_log_table_delete() */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr, /*!< in: mtr; must be committed before + latching any further pages */ + btr_latch_mode mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ +{ + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + dberr_t err; + + pcur = &node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + if (pcur->restore_position(mode, mtr) != btr_pcur_t::SAME_ALL) { + return DB_CORRUPTION; + } + + ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur), + btr_cur_get_index(btr_cur)) + == thr_get_trx(thr)->id + || btr_cur_get_index(btr_cur)->table->is_temporary()); + ut_ad(node->ref != &trx_undo_metadata + || node->update->info_bits == REC_INFO_METADATA_ADD + || node->update->info_bits == REC_INFO_METADATA_ALTER); + + if (mode != BTR_MODIFY_TREE) { + ut_ad(mode == BTR_MODIFY_LEAF + || mode == BTR_MODIFY_LEAF_ALREADY_LATCHED); + + err = btr_cur_optimistic_update( + BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG, + btr_cur, offsets, offsets_heap, + node->update, node->cmpl_info, + thr, thr_get_trx(thr)->id, mtr); + ut_ad(err != DB_SUCCESS || node->ref != &trx_undo_metadata); + } else { + big_rec_t* dummy_big_rec; + + err = btr_cur_pessimistic_update( + BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG, + btr_cur, offsets, offsets_heap, heap, + &dummy_big_rec, node->update, + node->cmpl_info, thr, thr_get_trx(thr)->id, mtr); + + ut_a(!dummy_big_rec); + + if (err == DB_SUCCESS + && node->ref == &trx_undo_metadata + && btr_cur_get_index(btr_cur)->table->instant + && node->update->info_bits == REC_INFO_METADATA_ADD) { + btr_reset_instant(*btr_cur->index(), false, mtr); + } + } + + if (err != DB_SUCCESS) { + return err; + } + + switch (const auto id = btr_cur_get_index(btr_cur)->table->id) { + unsigned c; + case DICT_TABLES_ID: + if (node->trx != trx_roll_crash_recv_trx) { + break; + } + c = DICT_COL__SYS_TABLES__ID; + goto evict; + case DICT_INDEXES_ID: + if (node->trx != trx_roll_crash_recv_trx) { + break; + } else if (node->rec_type == TRX_UNDO_DEL_MARK_REC + && btr_cur_get_rec(btr_cur) + [8 + 8 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN] + == static_cast<byte>(*TEMP_INDEX_PREFIX_STR)) { + /* We are rolling back the DELETE of metadata + for a failed ADD INDEX operation. This does + not affect any cached table definition, + because we are filtering out such indexes in + dict_load_indexes(). */ + break; + } + /* fall through */ + case DICT_COLUMNS_ID: + static_assert(!DICT_COL__SYS_INDEXES__TABLE_ID, ""); + static_assert(!DICT_COL__SYS_COLUMNS__TABLE_ID, ""); + c = DICT_COL__SYS_COLUMNS__TABLE_ID; + /* This is rolling back an UPDATE or DELETE on SYS_COLUMNS. + If it was part of an instant ALTER TABLE operation, we + must evict the table definition, so that it can be + reloaded after the dictionary operation has been + completed. At this point, any corresponding operation + to the metadata record will have been rolled back. 
*/ + evict: + const dfield_t& table_id = *dtuple_get_nth_field(node->row, c); + ut_ad(dfield_get_len(&table_id) == 8); + node->trx->evict_table(mach_read_from_8( + static_cast<byte*>( + table_id.data)), + id == DICT_COLUMNS_ID); + } + + return DB_SUCCESS; +} + +/** Get the byte offset of the DB_TRX_ID column +@param[in] rec clustered index record +@param[in] index clustered index +@return the byte offset of DB_TRX_ID, from the start of rec */ +static ulint row_trx_id_offset(const rec_t* rec, const dict_index_t* index) +{ + ut_ad(index->n_uniq <= MAX_REF_PARTS); + ulint trx_id_offset = index->trx_id_offset; + if (!trx_id_offset) { + /* Reserve enough offsets for the PRIMARY KEY and 2 columns + so that we can access DB_TRX_ID, DB_ROLL_PTR. */ + rec_offs offsets_[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2]; + rec_offs_init(offsets_); + mem_heap_t* heap = NULL; + const ulint trx_id_pos = index->n_uniq ? index->n_uniq : 1; + rec_offs* offsets = rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + trx_id_pos + 1, &heap); + ut_ad(!heap); + ulint len; + trx_id_offset = rec_get_nth_field_offs( + offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + } + + return trx_id_offset; +} + +/** Determine if rollback must execute a purge-like operation. +@param node row undo +@return whether the record should be purged */ +static bool row_undo_mod_must_purge(const undo_node_t &node) +{ + ut_ad(node.rec_type == TRX_UNDO_UPD_DEL_REC); + ut_ad(!node.table->is_temporary()); + + const btr_cur_t &btr_cur= node.pcur.btr_cur; + ut_ad(btr_cur.index()->is_primary()); + DEBUG_SYNC_C("rollback_purge_clust"); + + if (!purge_sys.is_purgeable(node.new_trx_id)) + return false; + + const rec_t *rec= btr_cur_get_rec(&btr_cur); + return trx_read_trx_id(rec + row_trx_id_offset(rec, btr_cur.index())) == + node.new_trx_id; +} + +/***********************************************************//** +Undoes a modify in a clustered index record. Sets also the node state for the +next round of undo. 
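[Editor's sketch] row_trx_id_offset() above locates DB_TRX_ID behind the PRIMARY KEY fields of a clustered index record; in the simple case of a fixed-length key the offset is just the sum of the key field lengths, and DB_ROLL_PTR follows immediately. A simplified sketch of that layout (the 6- and 7-byte lengths match DATA_TRX_ID_LEN and DATA_ROLL_PTR_LEN; everything else is a stand-in and the variable-length-key case handled by rec_get_offsets() is omitted):

// Illustrative only, not InnoDB code.
#include <cstddef>
#include <numeric>
#include <vector>

struct ClusteredRecordLayout {
    std::vector<std::size_t> pk_field_lengths; // fixed-length PRIMARY KEY fields
    static constexpr std::size_t TRX_ID_LEN   = 6; // DATA_TRX_ID_LEN
    static constexpr std::size_t ROLL_PTR_LEN = 7; // DATA_ROLL_PTR_LEN

    // Byte offset of DB_TRX_ID from the start of the record data.
    std::size_t trx_id_offset() const {
        return std::accumulate(pk_field_lengths.begin(),
                               pk_field_lengths.end(), std::size_t{0});
    }
    // DB_ROLL_PTR immediately follows DB_TRX_ID.
    std::size_t roll_ptr_offset() const {
        return trx_id_offset() + TRX_ID_LEN;
    }
};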
+@return DB_SUCCESS or error code: we may run out of file space */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_clust( +/*===============*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + btr_pcur_t* pcur; + mtr_t mtr; + dberr_t err; + dict_index_t* index; + + ut_ad(thr_get_trx(thr) == node->trx); + ut_ad(node->trx->in_rollback); + + log_free_check(); + pcur = &node->pcur; + index = btr_cur_get_index(btr_pcur_get_btr_cur(pcur)); + ut_ad(index->is_primary()); + + mtr.start(); + if (index->table->is_temporary()) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + index->set_modified(mtr); + ut_ad(lock_table_has_locks(index->table)); + } + + mem_heap_t* heap = mem_heap_create(1024); + mem_heap_t* offsets_heap = NULL; + rec_offs* offsets = NULL; + byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; + + /* Try optimistic processing of the record, keeping changes within + the index page */ + + err = row_undo_mod_clust_low(node, &offsets, &offsets_heap, + heap, sys, thr, &mtr, BTR_MODIFY_LEAF); + + if (err != DB_SUCCESS) { + btr_pcur_commit_specify_mtr(pcur, &mtr); + + /* We may have to modify tree structure: do a pessimistic + descent down the index tree */ + + mtr.start(); + if (index->table->is_temporary()) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + index->set_modified(mtr); + } + + err = row_undo_mod_clust_low(node, &offsets, &offsets_heap, + heap, sys, thr, &mtr, + BTR_MODIFY_TREE); + ut_ad(err == DB_SUCCESS || err == DB_OUT_OF_FILE_SPACE); + } + + /** + * when scrubbing, and records gets cleared, + * the transaction id is not present afterwards. + * this is safe as: since the record is on free-list + * it can be reallocated at any time after this mtr-commits + * which is just below + */ + ut_ad(srv_immediate_scrub_data_uncompressed + || row_get_rec_trx_id(btr_pcur_get_rec(pcur), index, offsets) + == node->new_trx_id); + + btr_pcur_commit_specify_mtr(pcur, &mtr); + DEBUG_SYNC_C("rollback_undo_pk"); + + if (err != DB_SUCCESS) { + goto func_exit; + } + + /* FIXME: Perform the below operations in the above + mini-transaction when possible. */ + + if (node->rec_type == TRX_UNDO_UPD_DEL_REC) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing update_undo log record. 
*/ + ut_ad(node->new_trx_id); + + mtr.start(); + if (pcur->restore_position(BTR_MODIFY_LEAF, &mtr) != + btr_pcur_t::SAME_ALL) { + goto mtr_commit_exit; + } + + ut_ad(rec_get_deleted_flag(btr_pcur_get_rec(pcur), + dict_table_is_comp(node->table))); + + if (index->table->is_temporary()) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + err = btr_cur_optimistic_delete(&pcur->btr_cur, 0, + &mtr); + if (err != DB_FAIL) { + goto mtr_commit_exit; + } + err = DB_SUCCESS; + btr_pcur_commit_specify_mtr(pcur, &mtr); + } else { + index->set_modified(mtr); + if (!row_undo_mod_must_purge(*node)) { + goto mtr_commit_exit; + } + err = btr_cur_optimistic_delete(&pcur->btr_cur, 0, + &mtr); + if (err != DB_FAIL) { + goto mtr_commit_exit; + } + err = DB_SUCCESS; + btr_pcur_commit_specify_mtr(pcur, &mtr); + } + + mtr.start(); + if (pcur->restore_position(BTR_PURGE_TREE, &mtr) != + btr_pcur_t::SAME_ALL) { + goto mtr_commit_exit; + } + + ut_ad(rec_get_deleted_flag(btr_pcur_get_rec(pcur), + dict_table_is_comp(node->table))); + + if (index->table->is_temporary()) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + if (!row_undo_mod_must_purge(*node)) { + goto mtr_commit_exit; + } + index->set_modified(mtr); + } + + /* This operation is analogous to purge, we can free + also inherited externally stored fields. We can also + assume that the record was complete (including BLOBs), + because it had been delete-marked after it had been + completely inserted. Therefore, we are passing + rollback=false, just like purge does. */ + btr_cur_pessimistic_delete(&err, FALSE, &pcur->btr_cur, 0, + false, &mtr); + ut_ad(err == DB_SUCCESS || err == DB_OUT_OF_FILE_SPACE); + } else if (!index->table->is_temporary() && node->new_trx_id) { + /* We rolled back a record so that it still exists. + We must reset the DB_TRX_ID if the history is no + longer accessible by any active read view. */ + + mtr.start(); + if (pcur->restore_position(BTR_MODIFY_LEAF, &mtr) + != btr_pcur_t::SAME_ALL + || !purge_sys.is_purgeable(node->new_trx_id)) { + goto mtr_commit_exit; + } + + rec_t* rec = btr_pcur_get_rec(pcur); + ulint trx_id_offset = index->trx_id_offset; + ulint trx_id_pos = index->n_uniq ? index->n_uniq : 1; + /* Reserve enough offsets for the PRIMARY KEY and + 2 columns so that we can access DB_TRX_ID, DB_ROLL_PTR. */ + rec_offs offsets_[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2]; + if (trx_id_offset) { +#ifdef UNIV_DEBUG + ut_ad(rec_offs_validate(NULL, index, offsets)); + if (buf_block_get_page_zip( + btr_pcur_get_block(&node->pcur))) { + /* Below, page_zip_write_trx_id_and_roll_ptr() + needs offsets to access DB_TRX_ID,DB_ROLL_PTR. + We already computed offsets for possibly + another record in the clustered index. + Because the PRIMARY KEY is fixed-length, + the offsets for the PRIMARY KEY and + DB_TRX_ID,DB_ROLL_PTR are still valid. + Silence the rec_offs_validate() assertion. 
*/ + rec_offs_make_valid(rec, index, true, offsets); + } +#endif + } else if (rec_is_metadata(rec, *index)) { + ut_ad(!buf_block_get_page_zip(btr_pcur_get_block( + pcur))); + for (unsigned i = index->first_user_field(); i--; ) { + trx_id_offset += index->fields[i].fixed_len; + } + } else { + ut_ad(index->n_uniq <= MAX_REF_PARTS); + rec_offs_init(offsets_); + offsets = rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + trx_id_pos + 2, &heap); + ulint len; + trx_id_offset = rec_get_nth_field_offs( + offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + } + + if (trx_read_trx_id(rec + trx_id_offset) == node->new_trx_id) { + ut_ad(!rec_get_deleted_flag( + rec, dict_table_is_comp(node->table)) + || rec_is_alter_metadata(rec, *index)); + index->set_modified(mtr); + buf_block_t* block = btr_pcur_get_block(pcur); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + page_zip_write_trx_id_and_roll_ptr( + block, rec, offsets, trx_id_pos, + 0, 1ULL << ROLL_PTR_INSERT_FLAG_POS, + &mtr); + } else { + size_t offs = page_offset(rec + trx_id_offset); + mtr.memset(block, offs, DATA_TRX_ID_LEN, 0); + offs += DATA_TRX_ID_LEN; + mtr.write<1,mtr_t::MAYBE_NOP>(*block, + block->page.frame + + offs, 0x80U); + mtr.memset(block, offs + 1, + DATA_ROLL_PTR_LEN - 1, 0); + } + } + } else { + goto func_exit; + } + +mtr_commit_exit: + btr_pcur_commit_specify_mtr(pcur, &mtr); + +func_exit: + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + mem_heap_free(heap); + return(err); +} + +/***********************************************************//** +Delete marks or removes a secondary index entry if found. +@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_del_mark_or_remove_sec_low( +/*====================================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr, /*!< in: query thread */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: index entry */ + btr_latch_mode mode) /*!< in: latch mode BTR_MODIFY_LEAF or + BTR_MODIFY_TREE */ +{ + btr_pcur_t pcur; + btr_cur_t* btr_cur; + dberr_t err = DB_SUCCESS; + mtr_t mtr; + mtr_t mtr_vers; + const bool modify_leaf = mode == BTR_MODIFY_LEAF; + + row_mtr_start(&mtr, index, !modify_leaf); + + pcur.btr_cur.page_cur.index = index; + btr_cur = btr_pcur_get_btr_cur(&pcur); + + if (index->is_spatial()) { + mode = modify_leaf + ? btr_latch_mode(BTR_MODIFY_LEAF + | BTR_RTREE_DELETE_MARK + | BTR_RTREE_UNDO_INS) + : btr_latch_mode(BTR_PURGE_TREE | BTR_RTREE_UNDO_INS); + btr_cur->thr = thr; + if (UNIV_LIKELY(!rtr_search(entry, mode, &pcur, &mtr))) { + goto found; + } else { + goto func_exit; + } + } else if (!index->is_committed()) { + /* The index->online_status may change if the index is + or was being created online, but not committed yet. It + is protected by index->lock. */ + if (modify_leaf) { + mode = BTR_MODIFY_LEAF_ALREADY_LATCHED; + mtr_s_lock_index(index, &mtr); + } else { + ut_ad(mode == BTR_PURGE_TREE); + mode = BTR_PURGE_TREE_ALREADY_LATCHED; + mtr_x_lock_index(index, &mtr); + } + } else { + /* For secondary indexes, + index->online_status==ONLINE_INDEX_COMPLETE if + index->is_committed(). */ + ut_ad(!dict_index_is_online_ddl(index)); + } + + switch (UNIV_EXPECT(row_search_index_entry(entry, mode, &pcur, &mtr), + ROW_FOUND)) { + case ROW_NOT_FOUND: + /* In crash recovery, the secondary index record may + be missing if the UPDATE did not have time to insert + the secondary index records before the crash. 
When we + are undoing that UPDATE in crash recovery, the record + may be missing. + + In normal processing, if an update ends in a deadlock + before it has inserted all updated secondary index + records, then the undo will not find those records. */ + goto func_exit; + case ROW_FOUND: + break; + case ROW_BUFFERED: + case ROW_NOT_DELETED_REF: + /* These are invalid outcomes, because the mode passed + to row_search_index_entry() did not include any of the + flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */ + ut_error; + } + +found: + /* We should remove the index record if no prior version of the row, + which cannot be purged yet, requires its existence. If some requires, + we should delete mark the record. */ + + mtr_vers.start(); + + ut_a(node->pcur.restore_position(BTR_SEARCH_LEAF, &mtr_vers) == + btr_pcur_t::SAME_ALL); + + /* For temporary table, we can skip to check older version of + clustered index entry, because there is no MVCC or purge. */ + if (node->table->is_temporary() + || row_vers_old_has_index_entry( + false, btr_pcur_get_rec(&node->pcur), + &mtr_vers, index, entry, 0, 0)) { + btr_rec_set_deleted<true>(btr_cur_get_block(btr_cur), + btr_cur_get_rec(btr_cur), &mtr); + } else { + /* Remove the index record */ + + if (dict_index_is_spatial(index)) { + rec_t* rec = btr_pcur_get_rec(&pcur); + if (rec_get_deleted_flag(rec, + dict_table_is_comp(index->table))) { + ib::error() << "Record found in index " + << index->name << " is deleted marked" + " on rollback update."; + ut_ad(0); + } + } + + if (modify_leaf) { + err = btr_cur_optimistic_delete(btr_cur, 0, &mtr); + } else { + /* Passing rollback=false, + because we are deleting a secondary index record: + the distinction only matters when deleting a + record that contains externally stored columns. */ + ut_ad(!index->is_primary()); + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, + false, &mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + } + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers); + +func_exit: + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(err); +} + +/***********************************************************//** +Delete marks or removes a secondary index entry if found. +NOTE that if we updated the fields of a delete-marked secondary index record +so that alphabetically they stayed the same, e.g., 'abc' -> 'aBc', we cannot +return to the original values because we do not know them. But this should +not cause problems because in row0sel.cc, in queries we always retrieve the +clustered index record or an earlier version of it, if the secondary index +record through which we do the search is delete-marked. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_del_mark_or_remove_sec( +/*================================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr, /*!< in: query thread */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry) /*!< in: index entry */ +{ + dberr_t err; + + err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index, + entry, BTR_MODIFY_LEAF); + if (err == DB_SUCCESS) { + + return(err); + } + + err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index, + entry, BTR_PURGE_TREE); + return(err); +} + +/***********************************************************//** +Delete unmarks a secondary index entry which must be found. 
It might not be +delete-marked at the moment, but it does not harm to unmark it anyway. We also +need to update the fields of the secondary index record if we updated its +fields but alphabetically they stayed the same, e.g., 'abc' -> 'aBc'. +@retval DB_SUCCESS on success +@retval DB_FAIL if BTR_MODIFY_TREE should be tried +@retval DB_OUT_OF_FILE_SPACE when running out of tablespace +@retval DB_DUPLICATE_KEY if the value was missing + and an insert would lead to a duplicate exists */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_del_unmark_sec_and_undo_update( +/*========================================*/ + btr_latch_mode mode, /*!< in: search mode: BTR_MODIFY_LEAF or + BTR_MODIFY_TREE */ + que_thr_t* thr, /*!< in: query thread */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry) /*!< in: index entry */ +{ + btr_pcur_t pcur; + btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); + upd_t* update; + dberr_t err = DB_SUCCESS; + big_rec_t* dummy_big_rec; + mtr_t mtr; + trx_t* trx = thr_get_trx(thr); + const ulint flags + = BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG; + const auto orig_mode = mode; + + pcur.btr_cur.page_cur.index = index; + ut_ad(trx->id != 0); + + if (index->is_spatial()) { + /* FIXME: Currently we do a 2-pass search for the undo + due to avoid undel-mark a wrong rec in rolling back in + partial update. Later, we could log some info in + secondary index updates to avoid this. */ + static_assert(BTR_MODIFY_TREE == (8 | BTR_MODIFY_LEAF), ""); + ut_ad(!(mode & 8)); + mode = btr_latch_mode(mode | BTR_RTREE_DELETE_MARK); + } + +try_again: + row_mtr_start(&mtr, index, mode & 8); + + btr_cur->thr = thr; + + if (index->is_spatial()) { + if (!rtr_search(entry, mode, &pcur, &mtr)) { + goto found; + } + + if (mode != orig_mode && btr_cur->rtr_info->fd_del) { + mode = orig_mode; + btr_pcur_close(&pcur); + mtr.commit(); + goto try_again; + } + + goto not_found; + } + + switch (row_search_index_entry(entry, mode, &pcur, &mtr)) { + mem_heap_t* heap; + mem_heap_t* offsets_heap; + rec_offs* offsets; + case ROW_BUFFERED: + case ROW_NOT_DELETED_REF: + /* These are invalid outcomes, because the mode passed + to row_search_index_entry() did not include any of the + flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */ + ut_error; + case ROW_NOT_FOUND: +not_found: + if (btr_cur->up_match >= dict_index_get_n_unique(index) + || btr_cur->low_match >= dict_index_get_n_unique(index)) { + ib::warn() << "Record in index " << index->name + << " of table " << index->table->name + << " was not found on rollback, and" + " a duplicate exists: " + << *entry + << " at: " << rec_index_print( + btr_cur_get_rec(btr_cur), index); + err = DB_DUPLICATE_KEY; + break; + } + + ib::warn() << "Record in index " << index->name + << " of table " << index->table->name + << " was not found on rollback, trying to insert: " + << *entry + << " at: " << rec_index_print( + btr_cur_get_rec(btr_cur), index); + + /* Insert the missing record that we were trying to + delete-unmark. */ + big_rec_t* big_rec; + rec_t* insert_rec; + offsets = NULL; + offsets_heap = NULL; + + err = btr_cur_optimistic_insert( + flags, btr_cur, &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + 0, thr, &mtr); + ut_ad(!big_rec); + + if (err == DB_FAIL && mode == BTR_MODIFY_TREE) { + err = btr_cur_pessimistic_insert( + flags, btr_cur, + &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + 0, thr, &mtr); + /* There are no off-page columns in + secondary indexes. 
*/ + ut_ad(!big_rec); + } + + if (err == DB_SUCCESS) { + page_update_max_trx_id( + btr_cur_get_block(btr_cur), + btr_cur_get_page_zip(btr_cur), + trx->id, &mtr); + } + + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + + break; + case ROW_FOUND: +found: + btr_rec_set_deleted<false>(btr_cur_get_block(btr_cur), + btr_cur_get_rec(btr_cur), &mtr); + heap = mem_heap_create( + sizeof(upd_t) + + dtuple_get_n_fields(entry) * sizeof(upd_field_t)); + offsets_heap = NULL; + offsets = rec_get_offsets( + btr_cur_get_rec(btr_cur), + index, nullptr, index->n_core_fields, ULINT_UNDEFINED, + &offsets_heap); + update = row_upd_build_sec_rec_difference_binary( + btr_cur_get_rec(btr_cur), index, offsets, entry, heap); + if (upd_get_n_fields(update) == 0) { + + /* Do nothing */ + + } else if (mode != BTR_MODIFY_TREE) { + /* Try an optimistic updating of the record, keeping + changes within the page */ + + /* TODO: pass offsets, not &offsets */ + err = btr_cur_optimistic_update( + flags, btr_cur, &offsets, &offsets_heap, + update, 0, thr, thr_get_trx(thr)->id, &mtr); + switch (err) { + case DB_OVERFLOW: + case DB_UNDERFLOW: + case DB_ZIP_OVERFLOW: + err = DB_FAIL; + default: + break; + } + } else { + err = btr_cur_pessimistic_update( + flags, btr_cur, &offsets, &offsets_heap, + heap, &dummy_big_rec, + update, 0, thr, thr_get_trx(thr)->id, &mtr); + ut_a(!dummy_big_rec); + } + + mem_heap_free(heap); + mem_heap_free(offsets_heap); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(err); +} + +/***********************************************************//** +Undoes a modify in secondary indexes when undo record type is UPD_DEL. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_upd_del_sec( +/*=====================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + mem_heap_t* heap; + dberr_t err = DB_SUCCESS; + + ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC); + ut_ad(!node->undo_row); + + heap = mem_heap_create(1024); + + do { + dict_index_t* index = node->index; + + if (index->type & (DICT_FTS | DICT_CORRUPT) + || !index->is_committed()) { + continue; + } + + /* During online index creation, + HA_ALTER_INPLACE_COPY_NO_LOCK or HA_ALTER_INPLACE_NOCOPY_NO_LOCk + should guarantee that any active transaction has not modified + indexed columns such that col->ord_part was 0 at the + time when the undo log record was written. When we get + to roll back an undo log entry TRX_UNDO_DEL_MARK_REC, + it should always cover all affected indexes. */ + dtuple_t* entry = row_build_index_entry( + node->row, node->ext, index, heap); + + if (UNIV_UNLIKELY(!entry)) { + /* The database must have crashed after + inserting a clustered index record but before + writing all the externally stored columns of + that record. Because secondary index entries + are inserted after the clustered index record, + we may assume that the secondary index record + does not exist. However, this situation may + only occur during the rollback of incomplete + transactions. 
*/ + ut_a(thr_get_trx(thr) == trx_roll_crash_recv_trx); + } else { + err = row_undo_mod_del_mark_or_remove_sec( + node, thr, index, entry); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + + break; + } + } + + mem_heap_empty(heap); + } while ((node->index = dict_table_get_next_index(node->index))); + + mem_heap_free(heap); + + return(err); +} + +/***********************************************************//** +Undoes a modify in secondary indexes when undo record type is DEL_MARK. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_del_mark_sec( +/*======================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + mem_heap_t* heap; + dberr_t err = DB_SUCCESS; + + ut_ad(!node->undo_row); + + heap = mem_heap_create(1024); + + do { + dict_index_t* index = node->index; + + if (index->type & (DICT_FTS | DICT_CORRUPT) + || !index->is_committed()) { + continue; + } + + /* During online index creation, + HA_ALTER_INPLACE_COPY_NO_LOCK or HA_ALTER_INPLACE_NOCOPY_NO_LOCK + should guarantee that any active transaction has not modified + indexed columns such that col->ord_part was 0 at the + time when the undo log record was written. When we get + to roll back an undo log entry TRX_UNDO_DEL_MARK_REC, + it should always cover all affected indexes. */ + dtuple_t* entry = row_build_index_entry( + node->row, node->ext, index, heap); + + ut_a(entry); + + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_LEAF, thr, index, entry); + if (err == DB_FAIL) { + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_TREE, thr, index, entry); + } + + if (err == DB_DUPLICATE_KEY) { + index->type |= DICT_CORRUPT; + err = DB_SUCCESS; + /* Do not return any error to the caller. The + duplicate will be reported by ALTER TABLE or + CREATE UNIQUE INDEX. Unfortunately we cannot + report the duplicate key value to the DDL + thread, because the altered_table object is + private to its call stack. */ + } else if (err != DB_SUCCESS) { + break; + } + + mem_heap_empty(heap); + } while ((node->index = dict_table_get_next_index(node->index))); + + mem_heap_free(heap); + + return(err); +} + +/***********************************************************//** +Undoes a modify in secondary indexes when undo record type is UPD_EXIST. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_upd_exist_sec( +/*=======================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { + return DB_SUCCESS; + } + + mem_heap_t* heap = mem_heap_create(1024); + dberr_t err = DB_SUCCESS; + + do { + dict_index_t* index = node->index; + + if (index->type & (DICT_FTS | DICT_CORRUPT) + || !index->is_committed()) { + continue; + } + + if (!row_upd_changes_ord_field_binary_func( + index, node->update, +#ifdef UNIV_DEBUG + thr, +#endif /* UNIV_DEBUG */ + node->row, node->ext, ROW_BUILD_FOR_UNDO)) { + continue; + } + + /* Build the newest version of the index entry */ + dtuple_t* entry = row_build_index_entry( + node->row, node->ext, index, heap); + if (UNIV_UNLIKELY(!entry)) { + /* The server must have crashed in + row_upd_clust_rec_by_insert() before + the updated externally stored columns (BLOBs) + of the new clustered index entry were written. */ + + /* The table must be in DYNAMIC or COMPRESSED + format. 
REDUNDANT and COMPACT formats + store a local 768-byte prefix of each + externally stored column. */ + ut_a(dict_table_has_atomic_blobs(index->table)); + + /* This is only legitimate when + rolling back an incomplete transaction + after crash recovery. */ + ut_a(thr_get_trx(thr)->is_recovered); + + /* The server must have crashed before + completing the insert of the new + clustered index entry and before + inserting to the secondary indexes. + Because node->row was not yet written + to this index, we can ignore it. But + we must restore node->undo_row. */ + } else { + /* NOTE that if we updated the fields of a + delete-marked secondary index record so that + alphabetically they stayed the same, e.g., + 'abc' -> 'aBc', we cannot return to the + original values because we do not know them. + But this should not cause problems because + in row0sel.cc, in queries we always retrieve + the clustered index record or an earlier + version of it, if the secondary index record + through which we do the search is + delete-marked. */ + + err = row_undo_mod_del_mark_or_remove_sec( + node, thr, index, entry); + if (err != DB_SUCCESS) { + break; + } + } + + mem_heap_empty(heap); + /* We may have to update the delete mark in the + secondary index record of the previous version of + the row. We also need to update the fields of + the secondary index record if we updated its fields + but alphabetically they stayed the same, e.g., + 'abc' -> 'aBc'. */ + entry = row_build_index_entry_low(node->undo_row, + node->undo_ext, + index, heap, + ROW_BUILD_FOR_UNDO); + ut_a(entry); + + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_LEAF, thr, index, entry); + if (err == DB_FAIL) { + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_TREE, thr, index, entry); + } + + if (err == DB_DUPLICATE_KEY) { + index->type |= DICT_CORRUPT; + err = DB_SUCCESS; + } else if (err != DB_SUCCESS) { + break; + } + + mem_heap_empty(heap); + } while ((node->index = dict_table_get_next_index(node->index))); + + mem_heap_free(heap); + + return(err); +} + +/** Parse an update undo record. +@param[in,out] node row rollback state +@param[in] dict_locked whether the data dictionary cache is locked */ +static bool row_undo_mod_parse_undo_rec(undo_node_t* node, bool dict_locked) +{ + dict_index_t* clust_index; + undo_no_t undo_no; + table_id_t table_id; + trx_id_t trx_id; + roll_ptr_t roll_ptr; + byte info_bits; + byte type; + byte cmpl_info; + bool dummy_extern; + + ut_ad(node->trx->in_rollback); + ut_ad(!trx_undo_roll_ptr_is_insert(node->roll_ptr)); + + const byte *ptr = trx_undo_rec_get_pars( + node->undo_rec, &type, &cmpl_info, + &dummy_extern, &undo_no, &table_id); + node->rec_type = type; + + if (!node->is_temp) { + node->table = dict_table_open_on_id(table_id, dict_locked, + DICT_TABLE_OP_NORMAL); + } else if (!dict_locked) { + dict_sys.freeze(SRW_LOCK_CALL); + node->table = dict_sys.acquire_temporary_table(table_id); + dict_sys.unfreeze(); + } else { + node->table = dict_sys.acquire_temporary_table(table_id); + } + + if (!node->table) { + return false; + } + + ut_ad(!node->table->skip_alter_undo); + + if (UNIV_UNLIKELY(!node->table->is_accessible())) { +close_table: + /* Normally, tables should not disappear or become + unaccessible during ROLLBACK, because they should be + protected by InnoDB table locks. Corruption could be + a valid exception. 
+ + FIXME: When running out of temporary tablespace, it + would probably be better to just drop all temporary + tables (and temporary undo log records) of the current + connection, instead of doing this rollback. */ + dict_table_close(node->table, dict_locked); + node->table = NULL; + return false; + } + + clust_index = dict_table_get_first_index(node->table); + + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, + &info_bits); + + ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), + node->heap); + + ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id, + roll_ptr, info_bits, + node->heap, &(node->update)); + node->new_trx_id = trx_id; + node->cmpl_info = cmpl_info; + ut_ad(!node->ref->info_bits); + + if (node->update->info_bits & REC_INFO_MIN_REC_FLAG) { + if ((node->update->info_bits & ~REC_INFO_DELETED_FLAG) + != REC_INFO_MIN_REC_FLAG) { + ut_ad("wrong info_bits in undo log record" == 0); + goto close_table; + } + /* This must be an undo log record for a subsequent + instant ALTER TABLE, extending the metadata record. */ + ut_ad(clust_index->is_instant()); + ut_ad(clust_index->table->instant + || !(node->update->info_bits & REC_INFO_DELETED_FLAG)); + node->ref = &trx_undo_metadata; + node->update->info_bits = (node->update->info_bits + & REC_INFO_DELETED_FLAG) + ? REC_INFO_METADATA_ALTER + : REC_INFO_METADATA_ADD; + } + + if (!row_undo_search_clust_to_pcur(node)) { + /* As long as this rolling-back transaction exists, + the PRIMARY KEY value pointed to by the undo log + record should exist. + + However, if InnoDB is killed during a rollback, or + shut down during the rollback of recovered + transactions, then after restart we may try to roll + back some of the same undo log records again, because + trx_roll_try_truncate() is not being invoked after + every undo log record. + + It is also possible that the record + was not modified yet (the DB_ROLL_PTR does not match + node->roll_ptr) and thus there is nothing to roll back. + + btr_cur_upd_lock_and_undo() only writes the undo log + record after successfully acquiring an exclusive lock + on the the clustered index record. That lock will not + be released before the transaction is committed or + fully rolled back. (Exception: if the server was + killed, restarted, and shut down again before the + rollback of the recovered transaction was completed, + it is possible that the transaction was partially + rolled back and locks released.) */ + goto close_table; + } + + /* Extract indexed virtual columns from undo log */ + if (node->ref != &trx_undo_metadata && node->table->n_v_cols) { + row_upd_replace_vcol(node->row, node->table, + node->update, false, node->undo_row, + (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) + ? nullptr : ptr); + } + + return true; +} + +/***********************************************************//** +Undoes a modify operation on a row of a table. 
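[Editor's sketch] row_undo_mod() below dispatches on the three update-undo record types, and each type implies a different treatment of the secondary indexes before the clustered index record is rolled back. A compact sketch of that dispatch; the enumerators paraphrase TRX_UNDO_UPD_EXIST_REC, TRX_UNDO_DEL_MARK_REC and TRX_UNDO_UPD_DEL_REC and are not the real codes.

// Illustrative only, not InnoDB code.
#include <string_view>

enum class UpdateUndoType {
    UPDATE_OF_EXISTING_RECORD,   // like TRX_UNDO_UPD_EXIST_REC
    DELETE_MARKING,              // like TRX_UNDO_DEL_MARK_REC
    UPDATE_OF_DELETE_MARKED      // like TRX_UNDO_UPD_DEL_REC
};

std::string_view secondary_index_action(UpdateUndoType type) {
    switch (type) {
    case UpdateUndoType::UPDATE_OF_EXISTING_RECORD:
        // The newer secondary entries are delete-marked or removed and the
        // previous entries are delete-unmarked / restored.
        return "roll back changed secondary entries, restore old ones";
    case UpdateUndoType::DELETE_MARKING:
        // The row still exists; the delete marks are simply removed again.
        return "delete-unmark the secondary entries";
    case UpdateUndoType::UPDATE_OF_DELETE_MARKED:
        // The row is being re-deleted; entries are delete-marked or removed.
        return "delete-mark or remove the secondary entries";
    }
    return "unknown";
}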
+@return DB_SUCCESS or error code */ +dberr_t +row_undo_mod( +/*=========*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err = DB_SUCCESS; + ut_ad(thr_get_trx(thr) == node->trx); + const bool dict_locked = node->trx->dict_operation_lock_mode; + + if (!row_undo_mod_parse_undo_rec(node, dict_locked)) { + return DB_SUCCESS; + } + + ut_ad(node->table->is_temporary() + || lock_table_has_locks(node->table)); + node->index = dict_table_get_first_index(node->table); + ut_ad(dict_index_is_clust(node->index)); + + if (node->ref->info_bits) { + ut_ad(node->ref->is_metadata()); + goto rollback_clust; + } + + /* Skip the clustered index (the first index) */ + node->index = dict_table_get_next_index(node->index); + if (node->index) { + switch (node->rec_type) { + case TRX_UNDO_UPD_EXIST_REC: + err = row_undo_mod_upd_exist_sec(node, thr); + break; + case TRX_UNDO_DEL_MARK_REC: + err = row_undo_mod_del_mark_sec(node, thr); + break; + case TRX_UNDO_UPD_DEL_REC: + err = row_undo_mod_upd_del_sec(node, thr); + break; + default: + MY_ASSERT_UNREACHABLE(); + } + } + + if (err == DB_SUCCESS) { +rollback_clust: + err = row_undo_mod_clust(node, thr); + + bool update_statistics + = !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE); + + if (err == DB_SUCCESS && node->table->stat_initialized) { + switch (node->rec_type) { + case TRX_UNDO_UPD_EXIST_REC: + break; + case TRX_UNDO_DEL_MARK_REC: + dict_table_n_rows_inc(node->table); + update_statistics = update_statistics + || !srv_stats_include_delete_marked; + break; + case TRX_UNDO_UPD_DEL_REC: + dict_table_n_rows_dec(node->table); + update_statistics = update_statistics + || !srv_stats_include_delete_marked; + break; + } + + /* Do not attempt to update statistics when + executing ROLLBACK in the InnoDB SQL + interpreter, because in that case we would + already be holding dict_sys.latch, which + would be acquired when updating statistics. */ + if (update_statistics && !dict_locked) { + dict_stats_update_if_needed(node->table, + *node->trx); + } else { + node->table->stat_modified_counter++; + } + } + } + + dict_table_close(node->table, dict_locked); + + node->table = NULL; + + return(err); +} diff --git a/storage/innobase/row/row0undo.cc b/storage/innobase/row/row0undo.cc new file mode 100644 index 00000000..8a1041c8 --- /dev/null +++ b/storage/innobase/row/row0undo.cc @@ -0,0 +1,453 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0undo.cc +Row undo + +Created 1/8/1997 Heikki Tuuri +*******************************************************/ + +#include "row0undo.h" +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0uins.h" +#include "row0umod.h" +#include "row0upd.h" +#include "row0mysql.h" +#include "srv0srv.h" +#include "srv0start.h" + +/* How to undo row operations? +(1) For an insert, we have stored a prefix of the clustered index record +in the undo log. Using it, we look for the clustered record, and using +that we look for the records in the secondary indexes. The insert operation +may have been left incomplete, if the database crashed, for example. +We may have look at the trx id and roll ptr to make sure the record in the +clustered index is really the one for which the undo log record was +written. We can use the framework we get from the original insert op. +(2) Delete marking: We can use the framework we get from the original +delete mark op. We only have to check the trx id. +(3) Update: This may be the most complicated. We have to use the framework +we get from the original update op. + +What if the same trx repeatedly deletes and inserts an identical row. +Then the row id changes and also roll ptr. What if the row id was not +part of the ordering fields in the clustered index? Maybe we have to write +it to undo log. Well, maybe not, because if we order the row id and trx id +in descending order, then the only undeleted copy is the first in the +index. Our searches in row operations always position the cursor before +the first record in the result set. But, if there is no key defined for +a table, then it would be desirable that row id is in ascending order. +So, lets store row id in descending order only if it is not an ordering +field in the clustered index. + +NOTE: Deletes and inserts may lead to situation where there are identical +records in a secondary index. Is that a problem in the B-tree? Yes. +Also updates can lead to this, unless trx id and roll ptr are included in +ord fields. +(1) Fix in clustered indexes: include row id, trx id, and roll ptr +in node pointers of B-tree. +(2) Fix in secondary indexes: include all fields in node pointers, and +if an entry is inserted, check if it is equal to the right neighbor, +in which case update the right neighbor: the neighbor must be delete +marked, set it unmarked and write the trx id of the current transaction. + +What if the same trx repeatedly updates the same row, updating a secondary +index field or not? Updating a clustered index ordering field? + +(1) If it does not update the secondary index and not the clustered index +ord field. Then the secondary index record stays unchanged, but the +trx id in the secondary index record may be smaller than in the clustered +index record. This is no problem? +(2) If it updates secondary index ord field but not clustered: then in +secondary index there are delete marked records, which differ in an +ord field. No problem. 
+(3) Updates clustered ord field but not secondary, and secondary index +is unique. Then the record in secondary index is just updated at the +clustered ord field. +(4) + +Problem with duplicate records: +Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a +bigger trx id has inserted and delete marked a similar row, our trx inserts +again a similar row, and a trx with an even bigger id delete marks it. Then +the position of the row should change in the index if the trx id affects +the alphabetical ordering. + +Fix 2: If an insert encounters a similar row marked deleted, we turn the +insert into an 'update' of the row marked deleted. Then we must write undo +info on the update. A problem: what if a purge operation tries to remove +the delete marked row? + +We can think of the database row versions as a linked list which starts +from the record in the clustered index, and is linked by roll ptrs +through undo logs. The secondary index records are references which tell +what kinds of records can be found in this linked list for a record +in the clustered index. + +How to do the purge? A record can be removed from the clustered index +if its linked list becomes empty, i.e., the row has been marked deleted +and its roll ptr points to the record in the undo log we are going through, +doing the purge. Similarly, during a rollback, a record can be removed +if the stored roll ptr in the undo log points to a trx already (being) purged, +or if the roll ptr is NULL, i.e., it was a fresh insert. */ + +/********************************************************************//** +Creates a row undo node to a query graph. +@return own: undo node */ +undo_node_t* +row_undo_node_create( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + que_thr_t* parent, /*!< in: parent node, i.e., a thr node */ + mem_heap_t* heap) /*!< in: memory heap where created */ +{ + undo_node_t* undo; + + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) + || trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) + || trx_state_eq(trx, TRX_STATE_PREPARED)); + ut_ad(parent); + + undo = static_cast<undo_node_t*>( + mem_heap_alloc(heap, sizeof(undo_node_t))); + + undo->common.type = QUE_NODE_UNDO; + undo->common.parent = parent; + + undo->trx = trx; + + btr_pcur_init(&(undo->pcur)); + + undo->heap = mem_heap_create(256); + + return(undo); +} + +/***********************************************************//** +Looks for the clustered index record when node has the row reference. +The pcur in node is used in the search. If found, stores the row to node, +and stores the position of pcur, and detaches it. The pcur must be closed +by the caller in any case. 
+@return true if found; NOTE the node->pcur must be closed by the +caller, regardless of the return value */ +bool +row_undo_search_clust_to_pcur( +/*==========================*/ + undo_node_t* node) /*!< in/out: row undo node */ +{ + dict_index_t* clust_index; + bool found; + mtr_t mtr; + row_ext_t** ext; + const rec_t* rec; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(!node->table->skip_alter_undo); + + mtr_start(&mtr); + + clust_index = dict_table_get_first_index(node->table); + + found = row_search_on_row_ref(&node->pcur, BTR_MODIFY_LEAF, + node->table, node->ref, &mtr); + + if (!found) { + goto func_exit; + } + + rec = btr_pcur_get_rec(&node->pcur); + + offsets = rec_get_offsets(rec, clust_index, offsets, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + found = row_get_rec_roll_ptr(rec, clust_index, offsets) + == node->roll_ptr; + + if (found) { + ut_ad(row_get_rec_trx_id(rec, clust_index, offsets) + == node->trx->id || node->table->is_temporary()); + + if (dict_table_has_atomic_blobs(node->table)) { + /* There is no prefix of externally stored + columns in the clustered index record. Build a + cache of column prefixes. */ + ext = &node->ext; + } else { + /* REDUNDANT and COMPACT formats store a local + 768-byte prefix of each externally stored + column. No cache is needed. */ + ext = NULL; + node->ext = NULL; + } + + node->row = row_build(ROW_COPY_DATA, clust_index, rec, + offsets, NULL, + NULL, NULL, ext, node->heap); + + /* We will need to parse out virtual column info from undo + log, first mark them DATA_MISSING. So we will know if the + value gets updated */ + if (node->table->n_v_cols + && !trx_undo_roll_ptr_is_insert(node->roll_ptr) + && !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + for (ulint i = 0; + i < dict_table_get_n_v_cols(node->table); i++) { + dfield_get_type(dtuple_get_nth_v_field( + node->row, i))->mtype = DATA_MISSING; + } + } + + if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) { + ut_ad((node->row->info_bits & ~REC_INFO_DELETED_FLAG) + == REC_INFO_MIN_REC_FLAG + || node->row->info_bits == 0); + node->undo_row = dtuple_copy(node->row, node->heap); + row_upd_replace(node->undo_row, &node->undo_ext, + clust_index, node->update, node->heap); + } else { + ut_ad(((node->row->info_bits & ~REC_INFO_DELETED_FLAG) + == REC_INFO_MIN_REC_FLAG) + == (node->rec_type == TRX_UNDO_INSERT_METADATA)); + node->undo_row = NULL; + node->undo_ext = NULL; + } + + btr_pcur_store_position(&node->pcur, &mtr); + } + + if (heap) { + mem_heap_free(heap); + } + +func_exit: + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); + return(found); +} + +/** Get the latest undo log record for rollback. 
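[Editor's sketch] row_undo_rec_get() below has to interleave two undo logs, the redo-logged one and the temporary-table one, always rolling back the record with the highest undo number first and stopping at trx->roll_limit for a partial rollback. A small standalone sketch of that selection, using plain integers for the undo numbers; UndoLogTop is a stand-in for the trx_undo_t bookkeeping.

// Illustrative only, not InnoDB code.
#include <cstdint>
#include <optional>

struct UndoLogTop {
    bool          empty;
    std::uint64_t top_undo_no;   // undo number of the latest record in this log
};

// Returns 0 = nothing left to undo, 1 = take the next record from the
// redo-logged undo log, 2 = take it from the temporary (no-redo) undo log.
int pick_next_undo_log(const std::optional<UndoLogTop>& redo,
                       const std::optional<UndoLogTop>& temp,
                       std::uint64_t roll_limit)
{
    int           pick = 0;
    std::uint64_t best = 0;

    if (redo && !redo->empty && redo->top_undo_no >= roll_limit) {
        pick = 1;
        best = redo->top_undo_no;
    }
    if (temp && !temp->empty && temp->top_undo_no >= roll_limit
        && (pick == 0 || temp->top_undo_no > best)) {
        pick = 2;
    }
    return pick;   // records are undone in descending undo number order
}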
+@param[in,out] node rollback context +@return undo block for the undo log record +@retval nullptr if no undo log record was fetched */ +static buf_block_t* row_undo_rec_get(undo_node_t* node) +{ + trx_t* trx = node->trx; + + if (trx->pages_undone) { + trx->pages_undone = 0; + trx_undo_try_truncate(*trx); + } + + trx_undo_t* undo = NULL; + trx_undo_t* update = trx->rsegs.m_redo.undo; + trx_undo_t* temp = trx->rsegs.m_noredo.undo; + const undo_no_t limit = trx->roll_limit; + node->is_temp = false; + + ut_ad(!update || !temp || update->empty() || temp->empty() + || update->top_undo_no != temp->top_undo_no); + + if (update && !update->empty() && update->top_undo_no >= limit) { + if (!undo) { + undo = update; + } else if (undo->top_undo_no < update->top_undo_no) { + undo = update; + } + } + + if (temp && !temp->empty() && temp->top_undo_no >= limit) { + if (!undo || undo->top_undo_no < temp->top_undo_no) { + undo = temp; + node->is_temp = true; + } + } + + if (undo == NULL) { + trx_undo_try_truncate(*trx); + /* Mark any ROLLBACK TO SAVEPOINT completed, so that + if the transaction object is committed and reused + later, we will default to a full ROLLBACK. */ + trx->roll_limit = 0; + trx->in_rollback = false; + return nullptr; + } + + ut_ad(!undo->empty()); + ut_ad(limit <= undo->top_undo_no); + + node->roll_ptr = trx_undo_build_roll_ptr( + false, trx_sys.rseg_id(undo->rseg, !node->is_temp), + undo->top_page_no, undo->top_offset); + + mtr_t mtr; + mtr.start(); + + buf_block_t* undo_page = buf_page_get( + page_id_t(undo->rseg->space->id, undo->top_page_no), + 0, RW_S_LATCH, &mtr); + if (!undo_page) { + return nullptr; + } + + uint16_t offset = undo->top_offset; + + buf_block_t* prev_page = undo_page; + if (trx_undo_rec_t* prev_rec = trx_undo_get_prev_rec( + prev_page, offset, undo->hdr_page_no, undo->hdr_offset, + true, &mtr)) { + if (prev_page != undo_page) { + trx->pages_undone++; + } + + undo->top_page_no = prev_page->page.id().page_no(); + undo->top_offset = page_offset(prev_rec); + undo->top_undo_no = trx_undo_rec_get_undo_no(prev_rec); + ut_ad(!undo->empty()); + } else { + undo->top_undo_no = IB_ID_MAX; + ut_ad(undo->empty()); + } + + undo_page->fix(); + mtr.commit(); + + node->undo_rec = undo_page->page.frame + offset; + + const size_t end = mach_read_from_2(node->undo_rec); + if (UNIV_UNLIKELY(end <= offset + || end >= srv_page_size - FIL_PAGE_DATA_END)) { + undo_page->unfix(); + node->undo_rec = nullptr; + return nullptr; + } + + switch (node->undo_rec[2] & (TRX_UNDO_CMPL_INFO_MULT - 1)) { + case TRX_UNDO_INSERT_METADATA: + /* This record type was introduced in MDEV-11369 + instant ADD COLUMN, which was implemented after + MDEV-12288 removed the insert_undo log. There is no + instant ADD COLUMN for temporary tables. Therefore, + this record can only be present in the main undo log. */ + /* fall through */ + case TRX_UNDO_RENAME_TABLE: + ut_ad(undo == update); + /* fall through */ + case TRX_UNDO_INSERT_REC: + case TRX_UNDO_EMPTY: + node->roll_ptr |= 1ULL << ROLL_PTR_INSERT_FLAG_POS; + } + + trx->undo_no = node->undo_no = trx_undo_rec_get_undo_no( + node->undo_rec); + return undo_page; +} + +/***********************************************************//** +Fetches an undo log record and does the undo for the recorded operation. +If none left, or a partial rollback completed, returns control to the +parent node, which is always a query thread node. 
+@return DB_SUCCESS if operation successfully completed, else error code */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +row_undo( +/*=====*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + ut_ad(node->trx->in_rollback); + + buf_block_t* undo_page = row_undo_rec_get(node); + + if (!undo_page) { + /* Rollback completed for this query thread */ + thr->run_node = que_node_get_parent(node); + return DB_SUCCESS; + } + + dberr_t err = trx_undo_roll_ptr_is_insert(node->roll_ptr) + ? row_undo_ins(node, thr) : row_undo_mod(node, thr); + undo_page->unfix(); + btr_pcur_close(&(node->pcur)); + + mem_heap_empty(node->heap); + + thr->run_node = node; + + return(err); +} + +/***********************************************************//** +Undoes a row operation in a table. This is a high-level function used +in SQL execution graphs. +@return query thread to run next or NULL */ +que_thr_t* +row_undo_step( +/*==========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + undo_node_t* node; + trx_t* trx = thr_get_trx(thr); + + node = static_cast<undo_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_UNDO); + + if (UNIV_UNLIKELY(!trx->dict_operation + && !srv_undo_sources + && srv_shutdown_state != SRV_SHUTDOWN_NONE) + && (srv_fast_shutdown == 3 || trx == trx_roll_crash_recv_trx)) { + /* Shutdown has been initiated. */ + trx->error_state = DB_INTERRUPTED; + return NULL; + } + + if (UNIV_UNLIKELY(trx == trx_roll_crash_recv_trx)) { + trx_roll_report_progress(); + } + + err = row_undo(node, thr); + +#ifdef ENABLED_DEBUG_SYNC + if (trx->mysql_thd) { + DEBUG_SYNC_C("trx_after_rollback_row"); + } +#endif /* ENABLED_DEBUG_SYNC */ + + trx->error_state = err; + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + ib::fatal() << "Error (" << err << ") in rollback."; + } + + return(thr); +} diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc new file mode 100644 index 00000000..bec53841 --- /dev/null +++ b/storage/innobase/row/row0upd.cc @@ -0,0 +1,3002 @@ +/***************************************************************************** + +Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0upd.cc +Update of a row + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#include "row0upd.h" +#include "dict0dict.h" +#include "dict0mem.h" +#include "trx0undo.h" +#include "rem0rec.h" +#include "dict0boot.h" +#include "dict0crea.h" +#include "mach0data.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "que0que.h" +#include "row0ext.h" +#include "row0ins.h" +#include "row0log.h" +#include "row0row.h" +#include "row0sel.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "log0log.h" +#include "pars0sym.h" +#include "eval0eval.h" +#include "buf0lru.h" +#include "trx0rec.h" +#include "fts0fts.h" +#include "fts0types.h" +#include <algorithm> +#include <mysql/plugin.h> +#include <mysql/service_wsrep.h> +#ifdef WITH_WSREP +#include "log.h" +#include "wsrep.h" +#endif /* WITH_WSREP */ + + +/* What kind of latch and lock can we assume when the control comes to + ------------------------------------------------------------------- +an update node? +-------------- +Efficiency of massive updates would require keeping an x-latch on a +clustered index page through many updates, and not setting an explicit +x-lock on clustered index records, as they anyway will get an implicit +x-lock when they are updated. A problem is that the read nodes in the +graph should know that they must keep the latch when passing the control +up to the update node, and not set any record lock on the record which +will be updated. Another problem occurs if the execution is stopped, +as the kernel switches to another query thread, or the transaction must +wait for a lock. Then we should be able to release the latch and, maybe, +acquire an explicit x-lock on the record. + Because this seems too complicated, we conclude that the less +efficient solution of releasing all the latches when the control is +transferred to another node, and acquiring explicit x-locks, is better. */ + +/* How is a delete performed? If there is a delete without an +explicit cursor, i.e., a searched delete, there are at least +two different situations: +the implicit select cursor may run on (1) the clustered index or +on (2) a secondary index. The delete is performed by setting +the delete bit in the record and substituting the id of the +deleting transaction for the original trx id, and substituting a +new roll ptr for previous roll ptr. The old trx id and roll ptr +are saved in the undo log record. Thus, no physical changes occur +in the index tree structure at the time of the delete. Only +when the undo log is purged, the index records will be physically +deleted from the index trees. + +The query graph executing a searched delete would consist of +a delete node which has as a subtree a select subgraph. +The select subgraph should return a (persistent) cursor +in the clustered index, placed on page which is x-latched. +The delete node should look for all secondary index records for +this clustered index entry and mark them as deleted. When is +the x-latch freed? The most efficient way for performing a +searched delete is obviously to keep the x-latch for several +steps of query graph execution. 
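+
+To make the above concrete, a rough outline of such a searched delete
+(an illustrative sketch only, not actual code; the delete-marking of
+secondary index entries is performed by row_upd_sec_index_entry()
+below):
+
+	position a persistent cursor on the clustered index record;
+	write an undo log record saving the old DB_TRX_ID and DB_ROLL_PTR;
+	delete-mark the clustered index record and stamp it with the
+	deleting transaction id and the new roll ptr;
+	for each secondary index, build the old index entry from the row
+	and delete-mark the matching secondary index record;
+	the records are physically removed from the index trees only
+	later, by purge.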
*/ + +/************************************************************************* +IMPORTANT NOTE: Any operation that generates redo MUST check that there +is enough space in the redo log before for that operation. This is +done by calling log_free_check(). The reason for checking the +availability of the redo log space before the start of the operation is +that we MUST not hold any synchonization objects when performing the +check. +If you make a change in this module make sure that no codepath is +introduced where a call to log_free_check() is bypassed. */ + +/***********************************************************//** +Checks if an update vector changes some of the first ordering fields of an +index record. This is only used in foreign key checks and we can assume +that index does not contain column prefixes. +@return TRUE if changes */ +static +ibool +row_upd_changes_first_fields_binary( +/*================================*/ + dtuple_t* entry, /*!< in: old value of index entry */ + dict_index_t* index, /*!< in: index of entry */ + const upd_t* update, /*!< in: update vector for the row */ + ulint n); /*!< in: how many first fields to check */ + +/*********************************************************************//** +Checks if index currently is mentioned as a referenced index in a foreign +key constraint. + +@return true if referenced */ +static +bool +row_upd_index_is_referenced( +/*========================*/ + dict_index_t* index, /*!< in: index */ + trx_t* trx) /*!< in: transaction */ +{ + dict_table_t *table= index->table; + /* The pointers in table->referenced_set are safe to dereference + thanks to the SQL layer having acquired MDL on all (grand)parent tables. */ + dict_foreign_set::iterator end= table->referenced_set.end(); + return end != std::find_if(table->referenced_set.begin(), end, + dict_foreign_with_index(index)); +} + +#ifdef WITH_WSREP +static +bool +wsrep_row_upd_index_is_foreign( +/*========================*/ + dict_index_t* index, /*!< in: index */ + trx_t* trx) /*!< in: transaction */ +{ + if (!trx->is_wsrep()) + return false; + + dict_table_t *table= index->table; + + if (table->foreign_set.empty()) + return false; + + /* No MDL protects dereferencing the members of table->foreign_set. */ + const bool no_lock= !trx->dict_operation_lock_mode; + if (no_lock) + dict_sys.freeze(SRW_LOCK_CALL); + + auto end= table->foreign_set.end(); + const bool is_referenced= end != + std::find_if(table->foreign_set.begin(), end, + [index](const dict_foreign_t* f) + {return f->foreign_index == index;}); + if (no_lock) + dict_sys.unfreeze(); + + return is_referenced; +} +#endif /* WITH_WSREP */ + +/*********************************************************************//** +Checks if possible foreign key constraints hold after a delete of the record +under pcur. + +NOTE that this function will temporarily commit mtr and lose the +pcur position! + +@return DB_SUCCESS or an error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_upd_check_references_constraints( +/*=================================*/ + upd_node_t* node, /*!< in: row update node */ + btr_pcur_t* pcur, /*!< in: cursor positioned on a record; NOTE: the + cursor position is lost in this function! 
*/ + dict_table_t* table, /*!< in: table in question */ + dict_index_t* index, /*!< in: index of the cursor */ + rec_offs* offsets,/*!< in/out: rec_get_offsets(pcur.rec, index) */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_foreign_t* foreign; + mem_heap_t* heap; + dtuple_t* entry; + const rec_t* rec; + dberr_t err; + + DBUG_ENTER("row_upd_check_references_constraints"); + + if (table->referenced_set.empty()) { + DBUG_RETURN(DB_SUCCESS); + } + + rec = btr_pcur_get_rec(pcur); + ut_ad(rec_offs_validate(rec, index, offsets)); + + heap = mem_heap_create(500); + + entry = row_rec_to_index_entry(rec, index, offsets, heap); + + mtr_commit(mtr); + + DEBUG_SYNC_C("foreign_constraint_check_for_update"); + + mtr->start(); + + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "foreign_constraint_check_for_insert"); + + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + + /* Note that we may have an update which updates the index + record, but does NOT update the first fields which are + referenced in a foreign key constraint. Then the update does + NOT break the constraint. */ + + if (foreign->referenced_index == index + && (node->is_delete + || row_upd_changes_first_fields_binary( + entry, index, node->update, + foreign->n_fields))) { + dict_table_t* ref_table = nullptr; + + if (!foreign->foreign_table) { + ref_table = dict_table_open_on_name( + foreign->foreign_table_name_lookup, + false, DICT_ERR_IGNORE_NONE); + } + + err = row_ins_check_foreign_constraint( + FALSE, foreign, table, entry, thr); + + if (ref_table) { + dict_table_close(ref_table); + } + + if (err != DB_SUCCESS) { + goto func_exit; + } + } + } + + err = DB_SUCCESS; + +func_exit: + mem_heap_free(heap); + + DEBUG_SYNC_C("foreign_constraint_check_for_update_done"); + DBUG_RETURN(err); +} + +#ifdef WITH_WSREP +static +dberr_t +wsrep_row_upd_check_foreign_constraints( +/*=================================*/ + upd_node_t* node, /*!< in: row update node */ + btr_pcur_t* pcur, /*!< in: cursor positioned on a record; NOTE: the + cursor position is lost in this function! */ + dict_table_t* table, /*!< in: table in question */ + dict_index_t* index, /*!< in: index of the cursor */ + rec_offs* offsets,/*!< in/out: rec_get_offsets(pcur.rec, index) */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_foreign_t* foreign; + mem_heap_t* heap; + dtuple_t* entry; + const rec_t* rec; + dberr_t err; + + if (table->foreign_set.empty()) { + return(DB_SUCCESS); + } + + /* TODO: make native slave thread bail out here */ + + rec = btr_pcur_get_rec(pcur); + ut_ad(rec_offs_validate(rec, index, offsets)); + + heap = mem_heap_create(500); + + entry = row_rec_to_index_entry(rec, index, offsets, heap); + + mtr_commit(mtr); + + mtr_start(mtr); + + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + + foreign = *it; + /* Note that we may have an update which updates the index + record, but does NOT update the first fields which are + referenced in a foreign key constraint. Then the update does + NOT break the constraint. 
*/ + + if (foreign->foreign_index == index + && (node->is_delete + || row_upd_changes_first_fields_binary( + entry, index, node->update, + foreign->n_fields))) { + + dict_table_t *opened = nullptr; + + if (!foreign->referenced_table) { + foreign->referenced_table = + dict_table_open_on_name( + foreign->referenced_table_name_lookup, + false, DICT_ERR_IGNORE_NONE); + opened = foreign->referenced_table; + } + + err = row_ins_check_foreign_constraint( + TRUE, foreign, table, entry, thr); + + if (opened) { + dict_table_close(opened); + } + + if (err != DB_SUCCESS) { + goto func_exit; + } + } + } + + err = DB_SUCCESS; +func_exit: + mem_heap_free(heap); + + return(err); +} + +/** Determine if a FOREIGN KEY constraint needs to be processed. +@param[in] node query node +@param[in] trx transaction +@return whether the node cannot be ignored */ + +inline bool wsrep_must_process_fk(const upd_node_t* node, const trx_t* trx) +{ + if (!trx->is_wsrep()) { + return false; + } + return que_node_get_type(node->common.parent) != QUE_NODE_UPDATE + || static_cast<upd_node_t*>(node->common.parent)->cascade_node + != node; +} +#endif /* WITH_WSREP */ + +/*********************************************************************//** +Creates an update node for a query graph. +@return own: update node */ +upd_node_t* +upd_node_create( +/*============*/ + mem_heap_t* heap) /*!< in: mem heap where created */ +{ + upd_node_t* node; + + node = static_cast<upd_node_t*>( + mem_heap_zalloc(heap, sizeof(upd_node_t))); + + node->common.type = QUE_NODE_UPDATE; + node->state = UPD_NODE_UPDATE_CLUSTERED; + node->heap = mem_heap_create(128); + node->magic_n = UPD_NODE_MAGIC_N; + + return(node); +} + +/***********************************************************//** +Returns TRUE if row update changes size of some field in index or if some +field to be updated is stored externally in rec or update. +@return TRUE if the update changes the size of some field in index or +the field is external in rec or update */ +ibool +row_upd_changes_field_size_or_external( +/*===================================*/ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update) /*!< in: update vector */ +{ + const upd_field_t* upd_field; + const dfield_t* new_val; + ulint old_len; + ulint new_len; + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(NULL, index, offsets)); + ut_ad(!index->table->skip_alter_undo); + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + upd_field = upd_get_nth_field(update, i); + + /* We should ignore virtual field if the index is not + a virtual index */ + if (upd_fld_is_virtual_col(upd_field) + && !index->has_virtual()) { + continue; + } + + new_val = &(upd_field->new_val); + if (dfield_is_ext(new_val)) { + return(TRUE); + } + new_len = dfield_get_len(new_val); + ut_ad(new_len != UNIV_SQL_DEFAULT); + + if (dfield_is_null(new_val) && !rec_offs_comp(offsets)) { + new_len = dict_col_get_sql_null_size( + dict_index_get_nth_col(index, + upd_field->field_no), + 0); + } + + if (rec_offs_nth_default(offsets, upd_field->field_no)) { + /* This is an instantly added column that is + at the initial default value. 
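+			The value is not physically stored in the
+			record at all, so the update cannot be
+			applied in place; report a change so that
+			the record will be rebuilt with the column
+			materialized.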
*/ + return(TRUE); + } + + if (rec_offs_comp(offsets) + && rec_offs_nth_sql_null(offsets, upd_field->field_no)) { + /* Note that in the compact table format, for a + variable length field, an SQL NULL will use zero + bytes in the offset array at the start of the physical + record, but a zero-length value (empty string) will + use one byte! Thus, we cannot use update-in-place + if we update an SQL NULL varchar to an empty string! */ + + old_len = UNIV_SQL_NULL; + } else { + old_len = rec_offs_nth_size(offsets, + upd_field->field_no); + } + + if (old_len != new_len + || rec_offs_nth_extern(offsets, upd_field->field_no)) { + + return(TRUE); + } + } + + return(FALSE); +} + +/***************************************************************//** +Builds an update vector from those fields which in a secondary index entry +differ from a record that has the equal ordering fields. NOTE: we compare +the fields as binary strings! +@return own: update vector of differing fields */ +upd_t* +row_upd_build_sec_rec_difference_binary( +/*====================================*/ + const rec_t* rec, /*!< in: secondary index record */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + const dtuple_t* entry, /*!< in: entry to insert */ + mem_heap_t* heap) /*!< in: memory heap from which allocated */ +{ + upd_field_t* upd_field; + const dfield_t* dfield; + const byte* data; + ulint len; + upd_t* update; + ulint n_diff; + + /* This function is used only for a secondary index */ + ut_a(!dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_n_fields(offsets) == dtuple_get_n_fields(entry)); + ut_ad(!rec_offs_any_extern(offsets)); + ut_ad(!rec_offs_any_default(offsets)); + ut_ad(!index->table->skip_alter_undo); + + update = upd_create(dtuple_get_n_fields(entry), heap); + + n_diff = 0; + + for (uint16_t i = 0; i < dtuple_get_n_fields(entry); i++) { + + data = rec_get_nth_field(rec, offsets, i, &len); + + dfield = dtuple_get_nth_field(entry, i); + + /* NOTE that it may be that len != dfield_get_len(dfield) if we + are updating in a character set and collation where strings of + different length can be equal in an alphabetical comparison, + and also in the case where we have a column prefix index + and the last characters in the index field are spaces; the + latter case probably caused the assertion failures reported at + row0upd.cc line 713 in versions 4.0.14 - 4.0.16. */ + + /* NOTE: we compare the fields as binary strings! + (No collation) */ + + if (!dfield_data_is_binary_equal(dfield, len, data)) { + + upd_field = upd_get_nth_field(update, n_diff); + + dfield_copy(&(upd_field->new_val), dfield); + + upd_field_set_field_no(upd_field, i, index); + + n_diff++; + } + } + + update->n_fields = n_diff; + + return(update); +} + + +/** Builds an update vector from those fields, excluding the roll ptr and +trx id fields, which in an index entry differ from a record that has +the equal ordering fields. NOTE: we compare the fields as binary strings! 
+@param[in] index clustered index +@param[in] entry clustered index entry to insert +@param[in] rec clustered index record +@param[in] offsets rec_get_offsets(rec,index), or NULL +@param[in] no_sys skip the system columns + DB_TRX_ID and DB_ROLL_PTR +@param[in] trx transaction (for diagnostics), + or NULL +@param[in] heap memory heap from which allocated +@param[in] mysql_table NULL, or mysql table object when + user thread invokes dml +@param[out] error error number in case of failure +@return own: update vector of differing fields, excluding roll ptr and +trx id,if error is not equal to DB_SUCCESS, return NULL */ +upd_t* +row_upd_build_difference_binary( + dict_index_t* index, + const dtuple_t* entry, + const rec_t* rec, + const rec_offs* offsets, + bool no_sys, + bool ignore_warnings, + trx_t* trx, + mem_heap_t* heap, + TABLE* mysql_table, + dberr_t* error) +{ + ulint len; + upd_t* update; + ulint n_diff; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint n_v_fld = dtuple_get_n_v_fields(entry); + rec_offs_init(offsets_); + + /* This function is used only for a clustered index */ + ut_a(dict_index_is_clust(index)); + ut_ad(!index->table->skip_alter_undo); + ut_ad(entry->n_fields <= index->n_fields); + ut_ad(entry->n_fields >= index->n_core_fields); + + update = upd_create(index->n_fields + n_v_fld, heap); + + n_diff = 0; + + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + } else { + ut_ad(rec_offs_validate(rec, index, offsets)); + } + + for (uint16_t i = 0; i < entry->n_fields; i++) { + const byte* data = rec_get_nth_cfield(rec, index, offsets, i, + &len); + const dfield_t* dfield = dtuple_get_nth_field(entry, i); + + /* NOTE: we compare the fields as binary strings! + (No collation) */ + if (no_sys && (i == index->db_trx_id() + || i == index->db_roll_ptr())) { + continue; + } + + if (!dfield_is_ext(dfield) + != !rec_offs_nth_extern(offsets, i) + || !dfield_data_is_binary_equal(dfield, len, data)) { + upd_field_t* uf = upd_get_nth_field(update, n_diff++); + dfield_copy(&uf->new_val, dfield); + upd_field_set_field_no(uf, i, index); + } + } + + for (uint16_t i = static_cast<uint16_t>(entry->n_fields); + i < index->n_fields; i++) { + upd_field_t* uf = upd_get_nth_field(update, n_diff++); + const dict_col_t* col = dict_index_get_nth_col(index, i); + /* upd_create() zero-initialized uf */ + uf->new_val.data = const_cast<byte*>(col->instant_value(&len)); + uf->new_val.len = static_cast<unsigned>(len); + dict_col_copy_type(col, &uf->new_val.type); + upd_field_set_field_no(uf, i, index); + } + + /* Check the virtual columns updates. 
Even if there is no non-virtual + column (base columns) change, we will still need to build the + indexed virtual column value so that undo log would log them ( + for purge/mvcc purpose) */ + if (n_v_fld > 0) { + row_ext_t* ext; + THD* thd; + + if (trx == NULL) { + thd = current_thd; + } else { + thd = trx->mysql_thd; + } + + ut_ad(!update->old_vrow); + + ib_vcol_row vc(NULL); + uchar *record = vc.record(thd, index, &mysql_table); + + for (uint16_t i = 0; i < n_v_fld; i++) { + const dict_v_col_t* col + = dict_table_get_nth_v_col(index->table, i); + + if (!col->m_col.ord_part) { + continue; + } + + if (update->old_vrow == NULL) { + update->old_vrow = row_build( + ROW_COPY_POINTERS, index, rec, offsets, + index->table, NULL, NULL, &ext, heap); + } + + dfield_t* vfield = innobase_get_computed_value( + update->old_vrow, col, index, + &vc.heap, heap, NULL, thd, mysql_table, record, + NULL, NULL, ignore_warnings); + if (vfield == NULL) { + *error = DB_COMPUTE_VALUE_FAILED; + return(NULL); + } + + const dfield_t* dfield = dtuple_get_nth_v_field( + entry, i); + + if (!dfield_data_is_binary_equal( + dfield, vfield->len, + static_cast<byte*>(vfield->data))) { + upd_field_t* uf = upd_get_nth_field(update, + n_diff++); + uf->old_v_val = static_cast<dfield_t*>( + mem_heap_alloc(heap, + sizeof *uf->old_v_val)); + dfield_copy(uf->old_v_val, vfield); + dfield_copy(&uf->new_val, dfield); + upd_field_set_v_field_no(uf, i, index); + } + } + } + + update->n_fields = n_diff; + ut_ad(update->validate()); + + return(update); +} + +/** Fetch a prefix of an externally stored column. +This is similar to row_ext_lookup(), but the row_ext_t holds the old values +of the column and must not be poisoned with the new values. +@param[in] data 'internally' stored part of the field +containing also the reference to the external part +@param[in] local_len length of data, in bytes +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] len input - length of prefix to +fetch; output: fetched length of the prefix +@param[in,out] heap heap where to allocate +@return BLOB prefix +@retval NULL if the record is incomplete (should only happen +in row_vers_vc_matches_cluster() executed concurrently with another purge) */ +static +byte* +row_upd_ext_fetch( + const byte* data, + ulint local_len, + ulint zip_size, + ulint* len, + mem_heap_t* heap) +{ + byte* buf = static_cast<byte*>(mem_heap_alloc(heap, *len)); + + *len = btr_copy_externally_stored_field_prefix( + buf, *len, zip_size, data, local_len); + + return *len ? buf : NULL; +} + +/** Replaces the new column value stored in the update vector in +the given index entry field. 
+@param[in,out] dfield data field of the index entry +@param[in] field index field +@param[in] col field->col +@param[in] uf update field +@param[in,out] heap memory heap for allocating and copying +the new value +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return whether the previous version was built successfully */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static +bool +row_upd_index_replace_new_col_val( + dfield_t* dfield, + const dict_field_t* field, + const dict_col_t* col, + const upd_field_t* uf, + mem_heap_t* heap, + ulint zip_size) +{ + ulint len; + const byte* data; + + dfield_copy_data(dfield, &uf->new_val); + + if (dfield_is_null(dfield)) { + return true; + } + + len = dfield_get_len(dfield); + data = static_cast<const byte*>(dfield_get_data(dfield)); + + if (field->prefix_len > 0) { + ibool fetch_ext = dfield_is_ext(dfield) + && len < (ulint) field->prefix_len + + BTR_EXTERN_FIELD_REF_SIZE; + + if (fetch_ext) { + ulint l = len; + + len = field->prefix_len; + + data = row_upd_ext_fetch(data, l, zip_size, + &len, heap); + if (UNIV_UNLIKELY(!data)) { + return false; + } + } + + len = dtype_get_at_most_n_mbchars(col->prtype, + col->mbminlen, col->mbmaxlen, + field->prefix_len, len, + (const char*) data); + + dfield_set_data(dfield, data, len); + + if (!fetch_ext) { + dfield_dup(dfield, heap); + } + + return true; + } + + switch (uf->orig_len) { + byte* buf; + case BTR_EXTERN_FIELD_REF_SIZE: + /* Restore the original locally stored + part of the column. In the undo log, + InnoDB writes a longer prefix of externally + stored columns, so that column prefixes + in secondary indexes can be reconstructed. */ + dfield_set_data(dfield, + data + len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + dfield_set_ext(dfield); + /* fall through */ + case 0: + dfield_dup(dfield, heap); + break; + default: + /* Reconstruct the original locally + stored part of the column. The data + will have to be copied. */ + ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE); + buf = static_cast<byte*>(mem_heap_alloc(heap, uf->orig_len)); + + /* Copy the locally stored prefix. */ + memcpy(buf, data, + unsigned(uf->orig_len) - BTR_EXTERN_FIELD_REF_SIZE); + + /* Copy the BLOB pointer. */ + memcpy(buf + uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE, + data + len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + + dfield_set_data(dfield, buf, uf->orig_len); + dfield_set_ext(dfield); + break; + } + + return true; +} + +/** Apply an update vector to an metadata entry. 
+@param[in,out] entry clustered index metadata record to be updated +@param[in] index index of the entry +@param[in] update update vector built for the entry +@param[in,out] heap memory heap for copying off-page columns */ +static +void +row_upd_index_replace_metadata( + dtuple_t* entry, + const dict_index_t* index, + const upd_t* update, + mem_heap_t* heap) +{ + ut_ad(!index->table->skip_alter_undo); + ut_ad(update->is_alter_metadata()); + ut_ad(entry->info_bits == update->info_bits); + ut_ad(entry->n_fields == ulint(index->n_fields) + 1); + const ulint zip_size = index->table->space->zip_size(); + const ulint first = index->first_user_field(); + ut_d(bool found_mblob = false); + + for (ulint i = upd_get_n_fields(update); i--; ) { + const upd_field_t* uf = upd_get_nth_field(update, i); + ut_ad(!upd_fld_is_virtual_col(uf)); + ut_ad(uf->field_no >= first - 2); + ulint f = uf->field_no; + dfield_t* dfield = dtuple_get_nth_field(entry, f); + + if (f == first) { + ut_d(found_mblob = true); + ut_ad(!dfield_is_null(&uf->new_val)); + ut_ad(dfield_is_ext(dfield)); + ut_ad(dfield_get_len(dfield) == FIELD_REF_SIZE); + ut_ad(!dfield_is_null(dfield)); + dfield_set_data(dfield, uf->new_val.data, + uf->new_val.len); + if (dfield_is_ext(&uf->new_val)) { + dfield_set_ext(dfield); + } + continue; + } + + f -= f > first; + const dict_field_t* field = dict_index_get_nth_field(index, f); + if (!row_upd_index_replace_new_col_val(dfield, field, + field->col, + uf, heap, zip_size)) { + ut_error; + } + } + + ut_ad(found_mblob); +} + +/** Apply an update vector to an index entry. +@param[in,out] entry index entry to be updated; the clustered index record + must be covered by a lock or a page latch to prevent + deletion (rollback or purge) +@param[in] index index of the entry +@param[in] update update vector built for the entry +@param[in,out] heap memory heap for copying off-page columns */ +void +row_upd_index_replace_new_col_vals_index_pos( + dtuple_t* entry, + const dict_index_t* index, + const upd_t* update, + mem_heap_t* heap) +{ + ut_ad(!index->table->skip_alter_undo); + ut_ad(!entry->is_metadata() || entry->info_bits == update->info_bits); + + if (UNIV_UNLIKELY(entry->is_alter_metadata())) { + row_upd_index_replace_metadata(entry, index, update, heap); + return; + } + + const ulint zip_size = index->table->space->zip_size(); + + dtuple_set_info_bits(entry, update->info_bits); + + for (uint16_t i = index->n_fields; i--; ) { + const dict_field_t* field; + const dict_col_t* col; + const upd_field_t* uf; + + field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(field); + if (col->is_virtual()) { + const dict_v_col_t* vcol = reinterpret_cast< + const dict_v_col_t*>( + col); + + uf = upd_get_field_by_field_no( + update, vcol->v_pos, true); + } else { + uf = upd_get_field_by_field_no( + update, i, false); + } + + if (uf && UNIV_UNLIKELY(!row_upd_index_replace_new_col_val( + dtuple_get_nth_field(entry, i), + field, col, uf, heap, + zip_size))) { + ut_error; + } + } +} + +/** Replace the new column values stored in the update vector, +during trx_undo_prev_version_build(). 
+@param entry clustered index tuple where the values are replaced + (the clustered index leaf page latch must be held) +@param index clustered index +@param update update vector for the clustered index +@param heap memory heap for allocating and copying values +@return whether the previous version was built successfully */ +bool +row_upd_index_replace_new_col_vals(dtuple_t *entry, const dict_index_t &index, + const upd_t *update, mem_heap_t *heap) +{ + ut_ad(index.is_primary()); + const ulint zip_size= index.table->space->zip_size(); + + ut_ad(!index.table->skip_alter_undo); + dtuple_set_info_bits(entry, update->info_bits); + + for (ulint i= 0; i < index.n_fields; i++) + { + const dict_field_t *field= &index.fields[i]; + const dict_col_t* col= dict_field_get_col(field); + const upd_field_t *uf; + + if (col->is_virtual()) + { + const dict_v_col_t *vcol= reinterpret_cast<const dict_v_col_t*>(col); + uf= upd_get_field_by_field_no(update, vcol->v_pos, true); + } + else + uf= upd_get_field_by_field_no(update, static_cast<uint16_t> + (dict_col_get_clust_pos(col, &index)), + false); + + if (!uf) + continue; + + if (!row_upd_index_replace_new_col_val(dtuple_get_nth_field(entry, i), + field, col, uf, heap, zip_size)) + return false; + } + + return true; +} + +/** Replaces the virtual column values stored in the update vector. +@param[in,out] row row whose column to be set +@param[in] field data to set +@param[in] len data length +@param[in] vcol virtual column info */ +static +void +row_upd_set_vcol_data( + dtuple_t* row, + const byte* field, + ulint len, + dict_v_col_t* vcol) +{ + dfield_t* dfield = dtuple_get_nth_v_field(row, vcol->v_pos); + + if (dfield_get_type(dfield)->mtype == DATA_MISSING) { + dict_col_copy_type(&vcol->m_col, dfield_get_type(dfield)); + + dfield_set_data(dfield, field, len); + } +} + +/** Replaces the virtual column values stored in a dtuple with that of +a update vector. 
+@param[in,out] row row whose column to be updated +@param[in] table table +@param[in] update an update vector built for the clustered index +@param[in] upd_new update to new or old value +@param[in,out] undo_row undo row (if needs to be updated) +@param[in] ptr remaining part in update undo log */ +void +row_upd_replace_vcol( + dtuple_t* row, + const dict_table_t* table, + const upd_t* update, + bool upd_new, + dtuple_t* undo_row, + const byte* ptr) +{ + ulint col_no; + ulint i; + ulint n_cols; + + ut_ad(!table->skip_alter_undo); + + n_cols = dtuple_get_n_v_fields(row); + for (col_no = 0; col_no < n_cols; col_no++) { + dfield_t* dfield; + + const dict_v_col_t* col + = dict_table_get_nth_v_col(table, col_no); + + /* If there is no index on the column, do not bother for + value update */ + if (!col->m_col.ord_part) { + continue; + } + + dfield = dtuple_get_nth_v_field(row, col_no); + + for (i = 0; i < upd_get_n_fields(update); i++) { + const upd_field_t* upd_field + = upd_get_nth_field(update, i); + if (!upd_fld_is_virtual_col(upd_field) + || upd_field->field_no != col->v_pos) { + continue; + } + + if (upd_new) { + dfield_copy_data(dfield, &upd_field->new_val); + } else { + dfield_copy_data(dfield, upd_field->old_v_val); + } + + dfield->type = upd_field->new_val.type; + break; + } + } + + bool first_v_col = true; + bool is_undo_log = true; + + /* We will read those unchanged (but indexed) virtual columns in */ + if (ptr) { + const byte* const end_ptr = ptr + mach_read_from_2(ptr); + ptr += 2; + + while (ptr != end_ptr) { + const byte* field; + uint32_t field_no, len, orig_len; + + field_no = mach_read_next_compressed(&ptr); + + const bool is_v = (field_no >= REC_MAX_N_FIELDS); + + if (is_v) { + ptr = trx_undo_read_v_idx( + table, ptr, first_v_col, &is_undo_log, + &field_no); + first_v_col = false; + } + + ptr = trx_undo_rec_get_col_val( + ptr, &field, &len, &orig_len); + + if (field_no == FIL_NULL) { + ut_ad(is_v); + continue; + } + + if (is_v) { + dict_v_col_t* vcol = dict_table_get_nth_v_col( + table, field_no); + + row_upd_set_vcol_data(row, field, len, vcol); + + if (undo_row) { + row_upd_set_vcol_data( + undo_row, field, len, vcol); + } + } + ut_ad(ptr<= end_ptr); + } + } +} + +/***********************************************************//** +Replaces the new column values stored in the update vector. 
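+That is, for each column of the row that has a matching field in the
+update vector, the new value is copied into the row tuple; the positions
+of ordering columns that remain stored externally are collected so that
+an optional prefix cache (row_ext_t) can be built, and indexed virtual
+column values are replaced via row_upd_replace_vcol().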
*/ +void +row_upd_replace( +/*============*/ + dtuple_t* row, /*!< in/out: row where replaced, + indexed by col_no; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + row_ext_t** ext, /*!< out, own: NULL, or externally + stored column prefixes */ + const dict_index_t* index, /*!< in: clustered index */ + const upd_t* update, /*!< in: an update vector built for the + clustered index */ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ulint col_no; + ulint i; + ulint n_cols; + ulint n_ext_cols; + ulint* ext_cols; + const dict_table_t* table; + + ut_ad(row); + ut_ad(ext); + ut_ad(index); + ut_ad(dict_index_is_clust(index)); + ut_ad(update); + ut_ad(heap); + ut_ad(update->validate()); + + n_cols = dtuple_get_n_fields(row); + table = index->table; + ut_ad(n_cols == dict_table_get_n_cols(table)); + + ext_cols = static_cast<ulint*>( + mem_heap_alloc(heap, n_cols * sizeof *ext_cols)); + + n_ext_cols = 0; + + dtuple_set_info_bits(row, update->info_bits); + + for (col_no = 0; col_no < n_cols; col_no++) { + + const dict_col_t* col + = dict_table_get_nth_col(table, col_no); + const ulint clust_pos + = dict_col_get_clust_pos(col, index); + dfield_t* dfield; + + if (UNIV_UNLIKELY(clust_pos == ULINT_UNDEFINED)) { + + continue; + } + + dfield = dtuple_get_nth_field(row, col_no); + + for (i = 0; i < upd_get_n_fields(update); i++) { + + const upd_field_t* upd_field + = upd_get_nth_field(update, i); + + if (upd_field->field_no != clust_pos + || upd_fld_is_virtual_col(upd_field)) { + + continue; + } + + dfield_copy_data(dfield, &upd_field->new_val); + break; + } + + if (dfield_is_ext(dfield) && col->ord_part) { + ext_cols[n_ext_cols++] = col_no; + } + } + + if (n_ext_cols) { + *ext = row_ext_create(n_ext_cols, ext_cols, *table, row, heap); + } else { + *ext = NULL; + } + + row_upd_replace_vcol(row, table, update, true, nullptr, nullptr); +} + +/***********************************************************//** +Checks if an update vector changes an ordering field of an index record. + +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. +NOTE: we compare the fields as binary strings! +@return TRUE if update vector changes an ordering field in the index record */ +ibool +row_upd_changes_ord_field_binary_func( +/*==================================*/ + dict_index_t* index, /*!< in: index of the record */ + const upd_t* update, /*!< in: update vector for the row; NOTE: the + field numbers in this MUST be clustered index + positions! 
*/ +#ifdef UNIV_DEBUG + const que_thr_t*thr, /*!< in: query thread */ +#endif /* UNIV_DEBUG */ + const dtuple_t* row, /*!< in: old value of row, or NULL if the + row and the data values in update are not + known when this function is called, e.g., at + compile time */ + const row_ext_t*ext, /*!< NULL, or prefixes of the externally + stored columns in the old row */ + ulint flag) /*!< in: ROW_BUILD_NORMAL, + ROW_BUILD_FOR_PURGE or ROW_BUILD_FOR_UNDO */ +{ + ulint n_unique; + ulint i; + const dict_index_t* clust_index; + + ut_ad(!index->table->skip_alter_undo); + + n_unique = dict_index_get_n_unique(index); + + clust_index = dict_table_get_first_index(index->table); + + for (i = 0; i < n_unique; i++) { + + const dict_field_t* ind_field; + const dict_col_t* col; + ulint col_no; + const upd_field_t* upd_field; + const dfield_t* dfield; + dfield_t dfield_ext; + ulint dfield_len= 0; + const byte* buf; + bool is_virtual; + const dict_v_col_t* vcol = NULL; + + ind_field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(ind_field); + col_no = dict_col_get_no(col); + is_virtual = col->is_virtual(); + + if (is_virtual) { + vcol = reinterpret_cast<const dict_v_col_t*>(col); + + upd_field = upd_get_field_by_field_no( + update, vcol->v_pos, true); + } else { + upd_field = upd_get_field_by_field_no( + update, static_cast<uint16_t>( + dict_col_get_clust_pos( + col, clust_index)), + false); + } + + if (upd_field == NULL) { + continue; + } + + if (row == NULL) { + ut_ad(ext == NULL); + return(TRUE); + } + + if (is_virtual) { + dfield = dtuple_get_nth_v_field( + row, vcol->v_pos); + } else { + dfield = dtuple_get_nth_field(row, col_no); + } + + /* For spatial index update, since the different geometry + data could generate same MBR, so, if the new index entry is + same as old entry, which means the MBR is not changed, we + don't need to do anything. */ + if (dict_index_is_spatial(index) && i == 0) { + double mbr1[SPDIMS * 2]; + double mbr2[SPDIMS * 2]; + rtr_mbr_t* old_mbr; + rtr_mbr_t* new_mbr; + const uchar* dptr = NULL; + ulint flen = 0; + ulint dlen = 0; + mem_heap_t* temp_heap = NULL; + const dfield_t* new_field = &upd_field->new_val; + + const ulint zip_size = ext + ? ext->zip_size + : index->table->space->zip_size(); + + ut_ad(dfield->data != NULL + && dfield->len > GEO_DATA_HEADER_SIZE); + ut_ad(dict_col_get_spatial_status(col) != SPATIAL_NONE); + + /* Get the old mbr. */ + if (dfield_is_ext(dfield)) { + /* For off-page stored data, we + need to read the whole field data. */ + flen = dfield_get_len(dfield); + dptr = static_cast<const byte*>( + dfield_get_data(dfield)); + temp_heap = mem_heap_create(1000); + + dptr = btr_copy_externally_stored_field( + &dlen, dptr, + zip_size, + flen, + temp_heap); + } else { + dptr = static_cast<const uchar*>(dfield->data); + dlen = dfield->len; + } + + rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE, + static_cast<uint>(dlen + - GEO_DATA_HEADER_SIZE), + SPDIMS, mbr1); + old_mbr = reinterpret_cast<rtr_mbr_t*>(mbr1); + + /* Get the new mbr. */ + if (dfield_is_ext(new_field)) { + if (flag == ROW_BUILD_FOR_UNDO + && dict_table_has_atomic_blobs( + index->table)) { + /* For ROW_FORMAT=DYNAMIC + or COMPRESSED, a prefix of + off-page records is stored + in the undo log record + (for any column prefix indexes). + For SPATIAL INDEX, we must + ignore this prefix. The + full column value is stored in + the BLOB. + For non-spatial index, we + would have already fetched a + necessary prefix of the BLOB, + available in the "ext" parameter. 
+ + Here, for SPATIAL INDEX, we are + fetching the full column, which is + potentially wasting a lot of I/O, + memory, and possibly involving a + concurrency problem, similar to ones + that existed before the introduction + of row_ext_t. + + MDEV-11657 FIXME: write the MBR + directly to the undo log record, + and avoid recomputing it here! */ + flen = BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(dfield_get_len(new_field) >= + BTR_EXTERN_FIELD_REF_SIZE); + dptr = static_cast<const byte*>( + dfield_get_data(new_field)) + + dfield_get_len(new_field) + - BTR_EXTERN_FIELD_REF_SIZE; + } else { + flen = dfield_get_len(new_field); + dptr = static_cast<const byte*>( + dfield_get_data(new_field)); + } + + if (temp_heap == NULL) { + temp_heap = mem_heap_create(1000); + } + + dptr = btr_copy_externally_stored_field( + &dlen, dptr, + zip_size, + flen, + temp_heap); + } else { + dptr = static_cast<const byte*>( + upd_field->new_val.data); + dlen = upd_field->new_val.len; + } + rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE, + static_cast<uint>(dlen + - GEO_DATA_HEADER_SIZE), + SPDIMS, mbr2); + new_mbr = reinterpret_cast<rtr_mbr_t*>(mbr2); + + if (temp_heap) { + mem_heap_free(temp_heap); + } + + if (!MBR_EQUAL_CMP(old_mbr, new_mbr)) { + return(TRUE); + } else { + continue; + } + } + + /* This treatment of column prefix indexes is loosely + based on row_build_index_entry(). */ + + if (UNIV_LIKELY(ind_field->prefix_len == 0) + || dfield_is_null(dfield)) { + /* do nothing special */ + } else if (ext) { + /* Silence a compiler warning without + silencing a Valgrind error. */ + dfield_len = 0; + MEM_UNDEFINED(&dfield_len, sizeof dfield_len); + /* See if the column is stored externally. */ + buf = row_ext_lookup(ext, col_no, &dfield_len); + + ut_ad(col->ord_part); + + if (UNIV_LIKELY_NULL(buf)) { + if (UNIV_UNLIKELY(buf == field_ref_zero)) { + /* The externally stored field + was not written yet. This + record should only be seen by + trx_rollback_recovered() + when the server had crashed before + storing the field. */ + ut_ad(!thr + || thr->graph->trx->is_recovered); + ut_ad(!thr + || thr->graph->trx + == trx_roll_crash_recv_trx); + return(TRUE); + } + + goto copy_dfield; + } + } else if (dfield_is_ext(dfield)) { + dfield_len = dfield_get_len(dfield); + ut_a(dfield_len > BTR_EXTERN_FIELD_REF_SIZE); + dfield_len -= BTR_EXTERN_FIELD_REF_SIZE; + ut_a(dict_index_is_clust(index) + || ind_field->prefix_len <= dfield_len); + + buf= static_cast<const byte*>(dfield_get_data(dfield)); +copy_dfield: + ut_a(dfield_len > 0); + dfield_copy(&dfield_ext, dfield); + dfield_set_data(&dfield_ext, buf, dfield_len); + dfield = &dfield_ext; + } + + if (!dfield_datas_are_binary_equal( + dfield, &upd_field->new_val, + ind_field->prefix_len)) { + + return(TRUE); + } + } + + return(FALSE); +} + +/***********************************************************//** +Checks if an update vector changes an ordering field of an index record. +NOTE: we compare the fields as binary strings! 
+@return TRUE if update vector may change an ordering field in an index +record */ +ibool +row_upd_changes_some_index_ord_field_binary( +/*========================================*/ + const dict_table_t* table, /*!< in: table */ + const upd_t* update) /*!< in: update vector for the row */ +{ + upd_field_t* upd_field; + dict_index_t* index; + ulint i; + + index = dict_table_get_first_index(table); + + for (i = 0; i < upd_get_n_fields(update); i++) { + + upd_field = upd_get_nth_field(update, i); + + if (upd_fld_is_virtual_col(upd_field)) { + if (dict_table_get_nth_v_col(index->table, + upd_field->field_no) + ->m_col.ord_part) { + return(TRUE); + } + } else { + if (dict_field_get_col(dict_index_get_nth_field( + index, upd_field->field_no))->ord_part) { + return(TRUE); + } + } + } + + return(FALSE); +} + +/***********************************************************//** +Checks if an FTS Doc ID column is affected by an UPDATE. +@return whether the Doc ID column is changed */ +bool +row_upd_changes_doc_id( +/*===================*/ + dict_table_t* table, /*!< in: table */ + upd_field_t* upd_field) /*!< in: field to check */ +{ + ulint col_no; + dict_index_t* clust_index; + fts_t* fts = table->fts; + + ut_ad(!table->skip_alter_undo); + + clust_index = dict_table_get_first_index(table); + + /* Convert from index-specific column number to table-global + column number. */ + col_no = dict_index_get_nth_col_no(clust_index, upd_field->field_no); + + return(col_no == fts->doc_col); +} +/***********************************************************//** +Checks if an FTS indexed column is affected by an UPDATE. +@return offset within fts_t::indexes if FTS indexed column updated else +ULINT_UNDEFINED */ +ulint +row_upd_changes_fts_column( +/*=======================*/ + dict_table_t* table, /*!< in: table */ + upd_field_t* upd_field) /*!< in: field to check */ +{ + ulint col_no; + dict_index_t* clust_index; + fts_t* fts = table->fts; + + ut_ad(!table->skip_alter_undo); + + if (upd_fld_is_virtual_col(upd_field)) { + col_no = upd_field->field_no; + return(dict_table_is_fts_column(fts->indexes, col_no, true)); + } else { + clust_index = dict_table_get_first_index(table); + + /* Convert from index-specific column number to table-global + column number. */ + col_no = dict_index_get_nth_col_no(clust_index, + upd_field->field_no); + return(dict_table_is_fts_column(fts->indexes, col_no, false)); + } + +} + +/***********************************************************//** +Checks if an update vector changes some of the first ordering fields of an +index record. This is only used in foreign key checks and we can assume +that index does not contain column prefixes. 
+@return TRUE if changes */ +static +ibool +row_upd_changes_first_fields_binary( +/*================================*/ + dtuple_t* entry, /*!< in: index entry */ + dict_index_t* index, /*!< in: index of entry */ + const upd_t* update, /*!< in: update vector for the row */ + ulint n) /*!< in: how many first fields to check */ +{ + ulint n_upd_fields; + ulint i, j; + dict_index_t* clust_index; + + ut_ad(update && index); + ut_ad(n <= dict_index_get_n_fields(index)); + + n_upd_fields = upd_get_n_fields(update); + clust_index = dict_table_get_first_index(index->table); + + for (i = 0; i < n; i++) { + + const dict_field_t* ind_field; + const dict_col_t* col; + ulint col_pos; + + ind_field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(ind_field); + col_pos = dict_col_get_clust_pos(col, clust_index); + + ut_a(ind_field->prefix_len == 0); + + for (j = 0; j < n_upd_fields; j++) { + + upd_field_t* upd_field + = upd_get_nth_field(update, j); + + if (col_pos == upd_field->field_no + && !dfield_datas_are_binary_equal( + dtuple_get_nth_field(entry, i), + &upd_field->new_val, 0)) { + + return(TRUE); + } + } + } + + return(FALSE); +} + +/*********************************************************************//** +Copies the column values from a record. */ +UNIV_INLINE +void +row_upd_copy_columns( +/*=================*/ + rec_t* rec, /*!< in: record in a clustered index */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + const dict_index_t* index, /*!< in: index of rec */ + sym_node_t* column) /*!< in: first column in a column list, or + NULL */ +{ + ut_ad(dict_index_is_clust(index)); + + const byte* data; + ulint len; + + while (column) { + data = rec_get_nth_cfield( + rec, index, offsets, + column->field_nos[SYM_CLUST_FIELD_NO], &len); + eval_node_copy_and_alloc_val(column, data, len); + + column = UT_LIST_GET_NEXT(col_var_list, column); + } +} + +/*********************************************************************//** +Calculates the new values for fields to update. Note that row_upd_copy_columns +must have been called first. */ +UNIV_INLINE +void +row_upd_eval_new_vals( +/*==================*/ + upd_t* update) /*!< in/out: update vector */ +{ + que_node_t* exp; + upd_field_t* upd_field; + ulint n_fields; + ulint i; + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + upd_field = upd_get_nth_field(update, i); + + exp = upd_field->exp; + + eval_exp(exp); + + dfield_copy_data(&(upd_field->new_val), que_node_get_val(exp)); + } +} + +/** Stores to the heap the virtual columns that need for any indexes +@param[in,out] node row update node +@param[in] update an update vector if it is update +@param[in] thd mysql thread handle +@param[in,out] mysql_table mysql table object +@return true if success + false if virtual column value computation fails. */ +static +bool +row_upd_store_v_row( + upd_node_t* node, + const upd_t* update, + THD* thd, + TABLE* mysql_table) +{ + dict_index_t* index = dict_table_get_first_index(node->table); + ib_vcol_row vc(NULL); + + for (ulint col_no = 0; col_no < dict_table_get_n_v_cols(node->table); + col_no++) { + + const dict_v_col_t* col + = dict_table_get_nth_v_col(node->table, col_no); + + if (col->m_col.ord_part) { + dfield_t* dfield + = dtuple_get_nth_v_field(node->row, col_no); + ulint n_upd + = update ? 
upd_get_n_fields(update) : 0; + ulint i = 0; + + /* Check if the value is already in update vector */ + for (i = 0; i < n_upd; i++) { + const upd_field_t* upd_field + = upd_get_nth_field(update, i); + if (!(upd_field->new_val.type.prtype + & DATA_VIRTUAL) + || upd_field->field_no != col->v_pos) { + continue; + } + + dfield_copy_data(dfield, upd_field->old_v_val); + dfield_dup(dfield, node->heap); + break; + } + + /* Not updated */ + if (i >= n_upd) { + /* If this is an update, then the value + should be in update->old_vrow */ + if (update) { + if (update->old_vrow == NULL) { + /* This only happens in + cascade update. And virtual + column can't be affected, + so it is Ok to set it to NULL */ + dfield_set_null(dfield); + } else { + dfield_t* vfield + = dtuple_get_nth_v_field( + update->old_vrow, + col_no); + dfield_copy_data(dfield, vfield); + dfield_dup(dfield, node->heap); + } + } else { + uchar *record = vc.record(thd, index, + &mysql_table); + /* Need to compute, this happens when + deleting row */ + dfield_t* vfield = + innobase_get_computed_value( + node->row, col, index, + &vc.heap, node->heap, + NULL, thd, mysql_table, + record, NULL, NULL); + if (vfield == NULL) { + return false; + } + } + } + } + } + + return true; +} + +/** Stores to the heap the row on which the node->pcur is positioned. +@param[in] node row update node +@param[in] thd mysql thread handle +@param[in,out] mysql_table NULL, or mysql table object when + user thread invokes dml +@return false if virtual column value computation fails + true otherwise. */ +static +bool +row_upd_store_row( + upd_node_t* node, + THD* thd, + TABLE* mysql_table) +{ + dict_index_t* clust_index; + rec_t* rec; + mem_heap_t* heap = NULL; + row_ext_t** ext; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + const rec_offs* offsets; + rec_offs_init(offsets_); + + ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES); + + if (node->row != NULL) { + mem_heap_empty(node->heap); + } + + clust_index = dict_table_get_first_index(node->table); + + rec = btr_pcur_get_rec(node->pcur); + + offsets = rec_get_offsets(rec, clust_index, offsets_, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (dict_table_has_atomic_blobs(node->table)) { + /* There is no prefix of externally stored columns in + the clustered index record. Build a cache of column + prefixes. */ + ext = &node->ext; + } else { + /* REDUNDANT and COMPACT formats store a local + 768-byte prefix of each externally stored column. + No cache is needed. */ + ext = NULL; + node->ext = NULL; + } + + node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets, + NULL, NULL, NULL, ext, node->heap); + + if (node->table->n_v_cols) { + bool ok = row_upd_store_v_row(node, + node->is_delete ? NULL : node->update, + thd, mysql_table); + if (!ok) { + return false; + } + } + + if (node->is_delete == PLAIN_DELETE) { + node->upd_row = NULL; + node->upd_ext = NULL; + } else { + node->upd_row = dtuple_copy(node->row, node->heap); + row_upd_replace(node->upd_row, &node->upd_ext, + clust_index, node->update, node->heap); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return true; +} + +/***********************************************************//** +Updates a secondary index entry of a row. 
+@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_upd_sec_index_entry( +/*====================*/ + upd_node_t* node, /*!< in: row update node */ + que_thr_t* thr) /*!< in: query thread */ +{ + mtr_t mtr; + btr_pcur_t pcur; + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + dberr_t err = DB_SUCCESS; + trx_t* trx = thr_get_trx(thr); + btr_latch_mode mode; + ulint flags; + enum row_search_result search_result; + + ut_ad(trx->id != 0); + + index = node->index; + ut_ad(index->is_committed()); + + /* For secondary indexes, index->online_status==ONLINE_INDEX_COMPLETE + if index->is_committed(). */ + ut_ad(!dict_index_is_online_ddl(index)); + + const bool referenced = row_upd_index_is_referenced(index, trx); +#ifdef WITH_WSREP + const bool foreign = wsrep_row_upd_index_is_foreign(index, trx); +#endif /* WITH_WSREP */ + + heap = mem_heap_create(1024); + + /* Build old index entry */ + entry = row_build_index_entry(node->row, node->ext, index, heap); + ut_a(entry); + + log_free_check(); + + DEBUG_SYNC_C_IF_THD(trx->mysql_thd, + "before_row_upd_sec_index_entry"); + + mtr.start(); + mode = BTR_MODIFY_LEAF; + + switch (index->table->space_id) { + case SRV_TMP_SPACE_ID: + mtr.set_log_mode(MTR_LOG_NO_REDO); + flags = BTR_NO_LOCKING_FLAG; + break; + default: + index->set_modified(mtr); + /* fall through */ + case IBUF_SPACE_ID: + flags = index->table->no_rollback() ? BTR_NO_ROLLBACK : 0; + /* We can only buffer delete-mark operations if there + are no foreign key constraints referring to the index. */ + if (!referenced) { + mode = BTR_DELETE_MARK_LEAF; + } + break; + } + + /* Set the query thread, so that ibuf_insert_low() will be + able to invoke thd_get_trx(). */ + pcur.btr_cur.thr = thr; + pcur.btr_cur.page_cur.index = index; + + if (index->is_spatial()) { + mode = btr_latch_mode(BTR_MODIFY_LEAF | BTR_RTREE_DELETE_MARK); + if (UNIV_LIKELY(!rtr_search(entry, mode, &pcur, &mtr))) { + goto found; + } + + if (pcur.btr_cur.rtr_info->fd_del) { + /* We found the record, but a delete marked */ + goto close; + } + + goto not_found; + } + + search_result = row_search_index_entry(entry, mode, &pcur, &mtr); + + switch (search_result) { + const rec_t* rec; + case ROW_NOT_DELETED_REF: /* should only occur for BTR_DELETE */ + ut_error; + break; + case ROW_BUFFERED: + /* Entry was delete marked already. 
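+		   (The delete-mark request was buffered in the change
+		   buffer because the secondary index leaf page was not
+		   in the buffer pool; the mark will be applied when the
+		   buffered change is merged, so nothing more needs to
+		   be done for the old entry here.)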
*/ + break; + + case ROW_NOT_FOUND: +not_found: + rec = btr_pcur_get_rec(&pcur); + ib::error() + << "Record in index " << index->name + << " of table " << index->table->name + << " was not found on update: " << *entry + << " at: " << rec_index_print(rec, index); +#ifdef UNIV_DEBUG + mtr_commit(&mtr); + mtr_start(&mtr); + ut_ad(btr_validate_index(index, 0) == DB_SUCCESS); + ut_ad(0); +#endif /* UNIV_DEBUG */ + break; + case ROW_FOUND: +found: + ut_ad(err == DB_SUCCESS); + rec = btr_pcur_get_rec(&pcur); + + /* Delete mark the old index record; it can already be + delete marked if we return after a lock wait in + row_ins_sec_index_entry() below */ + if (!rec_get_deleted_flag( + rec, dict_table_is_comp(index->table))) { + err = lock_sec_rec_modify_check_and_lock( + flags, + btr_pcur_get_block(&pcur), + btr_pcur_get_rec(&pcur), index, thr, &mtr); + if (err != DB_SUCCESS) { + break; + } + + btr_rec_set_deleted<true>(btr_pcur_get_block(&pcur), + btr_pcur_get_rec(&pcur), + &mtr); +#ifdef WITH_WSREP + if (!referenced && foreign + && wsrep_must_process_fk(node, trx) + && !wsrep_thd_is_BF(trx->mysql_thd, FALSE)) { + + rec_offs* offsets = rec_get_offsets( + rec, index, NULL, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + err = wsrep_row_upd_check_foreign_constraints( + node, &pcur, index->table, + index, offsets, thr, &mtr); + + switch (err) { + case DB_SUCCESS: + case DB_NO_REFERENCED_ROW: + err = DB_SUCCESS; + break; + case DB_LOCK_WAIT: + case DB_DEADLOCK: + case DB_LOCK_WAIT_TIMEOUT: + WSREP_DEBUG("Foreign key check fail: " + "%s on table %s index %s query %s", + ut_strerr(err), index->name(), index->table->name.m_name, + wsrep_thd_query(trx->mysql_thd)); + break; + default: + WSREP_ERROR("Foreign key check fail: " + "%s on table %s index %s query %s", + ut_strerr(err), index->name(), index->table->name.m_name, + wsrep_thd_query(trx->mysql_thd)); + break; + } + } +#endif /* WITH_WSREP */ + } + +#ifdef WITH_WSREP + ut_ad(err == DB_SUCCESS || err == DB_LOCK_WAIT + || err == DB_DEADLOCK || err == DB_LOCK_WAIT_TIMEOUT); +#else + ut_ad(err == DB_SUCCESS); +#endif + + if (referenced) { + rec_offs* offsets = rec_get_offsets( + rec, index, NULL, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + /* NOTE that the following call loses + the position of pcur ! */ + err = row_upd_check_references_constraints( + node, &pcur, index->table, + index, offsets, thr, &mtr); + } + } + +close: + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + if (node->is_delete == PLAIN_DELETE || err != DB_SUCCESS) { + + goto func_exit; + } + + mem_heap_empty(heap); + + DEBUG_SYNC_C_IF_THD(trx->mysql_thd, + "before_row_upd_sec_new_index_entry"); + + /* Build a new index entry */ + entry = row_build_index_entry(node->upd_row, node->upd_ext, + index, heap); + ut_a(entry); + + /* Insert new index entry */ + err = row_ins_sec_index_entry(index, entry, thr, !node->is_delete); + +func_exit: + mem_heap_free(heap); + + return(err); +} + +/***********************************************************//** +Updates the secondary index record if it is changed in the row update or +deletes it if this is a delete. 
+@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_upd_sec_step( +/*=============*/ + upd_node_t* node, /*!< in: row update node */ + que_thr_t* thr) /*!< in: query thread */ +{ + ut_ad((node->state == UPD_NODE_UPDATE_ALL_SEC) + || (node->state == UPD_NODE_UPDATE_SOME_SEC)); + ut_ad(!dict_index_is_clust(node->index)); + + if (node->state == UPD_NODE_UPDATE_ALL_SEC + || row_upd_changes_ord_field_binary(node->index, node->update, + thr, node->row, node->ext)) { + return(row_upd_sec_index_entry(node, thr)); + } + + return(DB_SUCCESS); +} + +#ifdef UNIV_DEBUG +# define row_upd_clust_rec_by_insert_inherit(rec,index,offsets,entry,update) \ + row_upd_clust_rec_by_insert_inherit_func(rec,index,offsets,entry,update) +#else /* UNIV_DEBUG */ +# define row_upd_clust_rec_by_insert_inherit(rec,index,offsets,entry,update) \ + row_upd_clust_rec_by_insert_inherit_func(rec,entry,update) +#endif /* UNIV_DEBUG */ +/*******************************************************************//** +Mark non-updated off-page columns inherited when the primary key is +updated. We must mark them as inherited in entry, so that they are not +freed in a rollback. A limited version of this function used to be +called btr_cur_mark_dtuple_inherited_extern(). +@return whether any columns were inherited */ +static +bool +row_upd_clust_rec_by_insert_inherit_func( +/*=====================================*/ + const rec_t* rec, /*!< in: old record, or NULL */ +#ifdef UNIV_DEBUG + dict_index_t* index, /*!< in: index, or NULL */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec), or NULL */ +#endif /* UNIV_DEBUG */ + dtuple_t* entry, /*!< in/out: updated entry to be + inserted into the clustered index */ + const upd_t* update) /*!< in: update vector */ +{ + bool inherit = false; + + ut_ad(!rec == !offsets); + ut_ad(!rec == !index); + ut_ad(!rec || rec_offs_validate(rec, index, offsets)); + ut_ad(!rec || rec_offs_any_extern(offsets)); + + for (uint16_t i = 0; i < dtuple_get_n_fields(entry); i++) { + dfield_t* dfield = dtuple_get_nth_field(entry, i); + byte* data; + ulint len; + + ut_ad(!offsets + || !rec_offs_nth_extern(offsets, i) + == !dfield_is_ext(dfield) + || (!dict_index_get_nth_field(index, i)->name + && !dfield_is_ext(dfield) + && (dfield_is_null(dfield) || dfield->len == 0)) + || upd_get_field_by_field_no(update, i, false)); + if (!dfield_is_ext(dfield) + || upd_get_field_by_field_no(update, i, false)) { + continue; + } + +#ifdef UNIV_DEBUG + if (UNIV_LIKELY(rec != NULL)) { + ut_ad(!rec_offs_nth_default(offsets, i)); + const byte* rec_data + = rec_get_nth_field(rec, offsets, i, &len); + ut_ad(len == dfield_get_len(dfield)); + ut_ad(len != UNIV_SQL_NULL); + ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE); + + rec_data += len - BTR_EXTERN_FIELD_REF_SIZE; + + /* The pointer must not be zero. */ + ut_ad(memcmp(rec_data, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + /* The BLOB must be owned. */ + ut_ad(!(rec_data[BTR_EXTERN_LEN] + & BTR_EXTERN_OWNER_FLAG)); + } +#endif /* UNIV_DEBUG */ + + len = dfield_get_len(dfield); + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + + data = static_cast<byte*>(dfield_get_data(dfield)); + + data += len - BTR_EXTERN_FIELD_REF_SIZE; + /* The pointer must not be zero. */ + ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)); + + /* The BLOB must be owned, unless we are resuming from + a lock wait and we already had disowned the BLOB. 
*/ + ut_a(rec == NULL + || !(data[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)); + data[BTR_EXTERN_LEN] &= byte(~BTR_EXTERN_OWNER_FLAG); + data[BTR_EXTERN_LEN] |= BTR_EXTERN_INHERITED_FLAG; + /* The BTR_EXTERN_INHERITED_FLAG only matters in + rollback of a fresh insert. Purge will always free + the extern fields of a delete-marked row. */ + + inherit = true; + } + + return(inherit); +} + +/***********************************************************//** +Marks the clustered index record deleted and inserts the updated version +of the record to the index. This function should be used when the ordering +fields of the clustered index record change. This should be quite rare in +database applications. +@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_upd_clust_rec_by_insert( +/*========================*/ + upd_node_t* node, /*!< in/out: row update node */ + dict_index_t* index, /*!< in: clustered index of the record */ + que_thr_t* thr, /*!< in: query thread */ + bool referenced,/*!< in: whether index may be referenced in + a foreign key constraint */ +#ifdef WITH_WSREP + bool foreign,/*!< in: whether this is a foreign key */ +#endif + mtr_t* mtr) /*!< in/out: mini-transaction, + may be committed and restarted */ +{ + mem_heap_t* heap; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + trx_t* trx; + dict_table_t* table; + dtuple_t* entry; + dberr_t err; + rec_t* rec; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + + ut_ad(dict_index_is_clust(index)); + + rec_offs_init(offsets_); + + trx = thr_get_trx(thr); + table = node->table; + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + heap = mem_heap_create(1000); + + entry = row_build_index_entry_low(node->upd_row, node->upd_ext, + index, heap, ROW_BUILD_FOR_INSERT); + if (index->is_instant()) entry->trim(*index); + ut_ad(dtuple_get_info_bits(entry) == 0); + + { + dfield_t* t = dtuple_get_nth_field(entry, index->db_trx_id()); + ut_ad(t->len == DATA_TRX_ID_LEN); + trx_write_trx_id(static_cast<byte*>(t->data), trx->id); + } + + switch (node->state) { + default: + ut_error; + case UPD_NODE_INSERT_CLUSTERED: + /* A lock wait occurred in row_ins_clust_index_entry() in + the previous invocation of this function. */ + row_upd_clust_rec_by_insert_inherit( + NULL, NULL, NULL, entry, node->update); + break; + case UPD_NODE_UPDATE_CLUSTERED: + /* This is the first invocation of the function where + we update the primary key. Delete-mark the old record + in the clustered index and prepare to insert a new entry. */ + rec = btr_cur_get_rec(btr_cur); + offsets = rec_get_offsets(rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + ut_ad(page_rec_is_user_rec(rec)); + + if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { + /* If the clustered index record is already delete + marked, then we are here after a DB_LOCK_WAIT. + Skip delete marking clustered index and disowning + its blobs. */ + ut_ad(row_get_rec_trx_id(rec, index, offsets) + == trx->id); + ut_ad(!trx_undo_roll_ptr_is_insert( + row_get_rec_roll_ptr(rec, index, + offsets))); + goto check_fk; + } + + err = btr_cur_del_mark_set_clust_rec( + btr_cur_get_block(btr_cur), rec, index, offsets, + thr, node->row, mtr); + if (err != DB_SUCCESS) { + goto err_exit; + } + + /* If the the new row inherits externally stored + fields (off-page columns a.k.a. 
BLOBs) from the + delete-marked old record, mark them disowned by the + old record and owned by the new entry. */ + + if (rec_offs_any_extern(offsets)) { + if (row_upd_clust_rec_by_insert_inherit( + rec, index, offsets, + entry, node->update)) { + /* The blobs are disowned here, expecting the + insert down below to inherit them. But if the + insert fails, then this disown will be undone + when the operation is rolled back. */ + btr_cur_disown_inherited_fields( + btr_cur_get_block(btr_cur), + rec, index, offsets, node->update, + mtr); + } + } +check_fk: + if (referenced) { + /* NOTE that the following call loses + the position of pcur ! */ + + err = row_upd_check_references_constraints( + node, pcur, table, index, offsets, thr, mtr); + + if (err != DB_SUCCESS) { + goto err_exit; + } +#ifdef WITH_WSREP + } else if (foreign && wsrep_must_process_fk(node, trx)) { + err = wsrep_row_upd_check_foreign_constraints( + node, pcur, table, index, offsets, thr, mtr); + + switch (err) { + case DB_SUCCESS: + case DB_NO_REFERENCED_ROW: + err = DB_SUCCESS; + break; + case DB_LOCK_WAIT: + case DB_DEADLOCK: + case DB_LOCK_WAIT_TIMEOUT: + WSREP_DEBUG("Foreign key check fail: " + "%s on table %s index %s query %s", + ut_strerr(err), index->name(), index->table->name.m_name, + wsrep_thd_query(trx->mysql_thd)); + + goto err_exit; + default: + WSREP_ERROR("Foreign key check fail: " + "%s on table %s index %s query %s", + ut_strerr(err), index->name(), index->table->name.m_name, + wsrep_thd_query(trx->mysql_thd)); + + goto err_exit; + } +#endif /* WITH_WSREP */ + } + } + + mtr->commit(); + mtr->start(); + + node->state = UPD_NODE_INSERT_CLUSTERED; + err = row_ins_clust_index_entry(index, entry, thr, + dtuple_get_n_ext(entry)); +err_exit: + mem_heap_free(heap); + return(err); +} + +/***********************************************************//** +Updates a clustered index record of a row when the ordering fields do +not change. 
+@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_upd_clust_rec( +/*==============*/ + ulint flags, /*!< in: undo logging and locking flags */ + upd_node_t* node, /*!< in: row update node */ + dict_index_t* index, /*!< in: clustered index */ + rec_offs* offsets,/*!< in: rec_get_offsets() on node->pcur */ + mem_heap_t** offsets_heap, + /*!< in/out: memory heap, can be emptied */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in,out: mini-transaction; may be + committed and restarted here */ +{ + mem_heap_t* heap = NULL; + big_rec_t* big_rec = NULL; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + dberr_t err; + + ut_ad(dict_index_is_clust(index)); + ut_ad(!thr_get_trx(thr)->in_rollback); + ut_ad(!node->table->skip_alter_undo); + + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + ut_ad(btr_cur_get_index(btr_cur) == index); + ut_ad(!rec_get_deleted_flag(btr_cur_get_rec(btr_cur), + dict_table_is_comp(index->table))); + ut_ad(rec_offs_validate(btr_cur_get_rec(btr_cur), index, offsets)); + + /* Try optimistic updating of the record, keeping changes within + the page; we do not check locks because we assume the x-lock on the + record to update */ + + if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) { + err = btr_cur_update_in_place( + flags | BTR_NO_LOCKING_FLAG, btr_cur, + offsets, node->update, + node->cmpl_info, thr, thr_get_trx(thr)->id, mtr); + } else { + err = btr_cur_optimistic_update( + flags | BTR_NO_LOCKING_FLAG, btr_cur, + &offsets, offsets_heap, node->update, + node->cmpl_info, thr, thr_get_trx(thr)->id, mtr); + } + + if (err == DB_SUCCESS) { + goto func_exit; + } + + if (buf_pool.running_out()) { + err = DB_LOCK_TABLE_FULL; + goto func_exit; + } + + /* We may have to modify the tree structure: do a pessimistic descent + down the index tree */ + + mtr->commit(); + mtr->start(); + + if (index->table->is_temporary()) { + /* Disable locking, because temporary tables are never + shared between transactions or connections. */ + flags |= BTR_NO_LOCKING_FLAG; + mtr->set_log_mode(MTR_LOG_NO_REDO); + } else { + index->set_modified(*mtr); + } + + /* NOTE: this transaction has an s-lock or x-lock on the record and + therefore other transactions cannot modify the record when we have no + latch on the page. In addition, we assume that other query threads of + the same transaction do not modify the record in the meantime. + Therefore we can assert that the restoration of the cursor succeeds. */ + + ut_a(pcur->restore_position(BTR_MODIFY_TREE, mtr) == + btr_pcur_t::SAME_ALL); + + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + dict_table_is_comp(index->table))); + + if (!heap) { + heap = mem_heap_create(1024); + } + + err = btr_cur_pessimistic_update( + flags | BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, btr_cur, + &offsets, offsets_heap, heap, &big_rec, + node->update, node->cmpl_info, + thr, thr_get_trx(thr)->id, mtr); + if (big_rec) { + ut_a(err == DB_SUCCESS); + + DEBUG_SYNC_C("before_row_upd_extern"); + err = btr_store_big_rec_extern_fields( + pcur, offsets, big_rec, mtr, BTR_STORE_UPDATE); + DEBUG_SYNC_C("after_row_upd_extern"); + } + +func_exit: + if (heap) { + mem_heap_free(heap); + } + + if (big_rec) { + dtuple_big_rec_free(big_rec); + } + + return(err); +} + +/***********************************************************//** +Delete marks a clustered index record. 
+@return DB_SUCCESS if operation successfully completed, else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_upd_del_mark_clust_rec( +/*=======================*/ + upd_node_t* node, /*!< in: row update node */ + dict_index_t* index, /*!< in: clustered index */ + rec_offs* offsets,/*!< in/out: rec_get_offsets() for the + record under the cursor */ + que_thr_t* thr, /*!< in: query thread */ + bool referenced, + /*!< in: whether index may be referenced in + a foreign key constraint */ +#ifdef WITH_WSREP + bool foreign,/*!< in: whether this is a foreign key */ +#endif + mtr_t* mtr) /*!< in,out: mini-transaction; + will be committed and restarted */ +{ + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + rec_t* rec; + trx_t* trx = thr_get_trx(thr); + + ut_ad(dict_index_is_clust(index)); + ut_ad(node->is_delete == PLAIN_DELETE); + + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + /* Store row because we have to build also the secondary index + entries */ + + if (!row_upd_store_row(node, trx->mysql_thd, + thr->prebuilt && thr->prebuilt->table == node->table + ? thr->prebuilt->m_mysql_table : NULL)) { + return DB_COMPUTE_VALUE_FAILED; + } + + /* Mark the clustered index record deleted; we do not have to check + locks, because we assume that we have an x-lock on the record */ + + rec = btr_cur_get_rec(btr_cur); + + dberr_t err = btr_cur_del_mark_set_clust_rec( + btr_cur_get_block(btr_cur), rec, + index, offsets, thr, node->row, mtr); + + if (err != DB_SUCCESS) { + } else if (referenced) { + /* NOTE that the following call loses the position of pcur ! */ + + err = row_upd_check_references_constraints( + node, pcur, index->table, index, offsets, thr, mtr); +#ifdef WITH_WSREP + } else if (foreign && wsrep_must_process_fk(node, trx)) { + err = wsrep_row_upd_check_foreign_constraints( + node, pcur, index->table, index, offsets, thr, mtr); + + switch (err) { + case DB_SUCCESS: + case DB_NO_REFERENCED_ROW: + err = DB_SUCCESS; + break; + case DB_LOCK_WAIT: + case DB_DEADLOCK: + case DB_LOCK_WAIT_TIMEOUT: + WSREP_DEBUG("Foreign key check fail: " + "%d on table %s index %s query %s", + err, index->name(), index->table->name.m_name, + wsrep_thd_query(trx->mysql_thd)); + break; + default: + WSREP_ERROR("Foreign key check fail: " + "%d on table %s index %s query %s", + err, index->name(), index->table->name.m_name, + wsrep_thd_query(trx->mysql_thd)); + break; + } +#endif /* WITH_WSREP */ + } + + return(err); +} + +/***********************************************************//** +Updates the clustered index record. 
+@return DB_SUCCESS if operation successfully completed, DB_LOCK_WAIT +in case of a lock wait, else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_upd_clust_step( +/*===============*/ + upd_node_t* node, /*!< in: row update node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dict_index_t* index; + btr_pcur_t* pcur; + dberr_t err; + mtr_t mtr; + rec_t* rec; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets; + ulint flags; + trx_t* trx = thr_get_trx(thr); + + rec_offs_init(offsets_); + + index = dict_table_get_first_index(node->table); + + if (index->is_corrupted()) { + return DB_TABLE_CORRUPT; + } + + const bool referenced = row_upd_index_is_referenced(index, trx); +#ifdef WITH_WSREP + const bool foreign = wsrep_row_upd_index_is_foreign(index, trx); +#endif + + pcur = node->pcur; + + /* We have to restore the cursor to its position */ + + mtr.start(); + + if (node->table->is_temporary()) { + /* Disable locking, because temporary tables are + private to the connection (no concurrent access). */ + flags = node->table->no_rollback() + ? BTR_NO_ROLLBACK + : BTR_NO_LOCKING_FLAG; + /* Redo logging only matters for persistent tables. */ + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + flags = node->table->no_rollback() ? BTR_NO_ROLLBACK : 0; + index->set_modified(mtr); + } + + /* If the restoration does not succeed, then the same + transaction has deleted the record on which the cursor was, + and that is an SQL error. If the restoration succeeds, it may + still be that the same transaction has successively deleted + and inserted a record with the same ordering fields, but in + that case we know that the transaction has at least an + implicit x-lock on the record. */ + + ut_a(pcur->rel_pos == BTR_PCUR_ON); + + btr_latch_mode mode; + + DEBUG_SYNC_C_IF_THD(trx->mysql_thd, "innodb_row_upd_clust_step_enter"); + + if (dict_index_is_online_ddl(index)) { + ut_ad(node->table->id != DICT_INDEXES_ID); + mode = BTR_MODIFY_LEAF_ALREADY_LATCHED; + mtr_s_lock_index(index, &mtr); + } else { + mode = BTR_MODIFY_LEAF; + } + + if (pcur->restore_position(mode, &mtr) != btr_pcur_t::SAME_ALL) { + err = DB_RECORD_NOT_FOUND; + goto exit_func; + } + + rec = btr_pcur_get_rec(pcur); + offsets = rec_get_offsets(rec, index, offsets_, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (!flags && !node->has_clust_rec_x_lock) { + err = lock_clust_rec_modify_check_and_lock( + btr_pcur_get_block(pcur), + rec, index, offsets, thr); + if (err != DB_SUCCESS) { + goto exit_func; + } + } + + ut_ad(index->table->no_rollback() || index->table->is_temporary() + || row_get_rec_trx_id(rec, index, offsets) == trx->id + || lock_trx_has_expl_x_lock(*trx, *index->table, + btr_pcur_get_block(pcur)->page.id(), + page_rec_get_heap_no(rec))); + + if (node->is_delete == PLAIN_DELETE) { + err = row_upd_del_mark_clust_rec( + node, index, offsets, thr, referenced, +#ifdef WITH_WSREP + foreign, +#endif + &mtr); + goto all_done; + } + + /* If the update is made for MySQL, we already have the update vector + ready, else we have to do some evaluation: */ + + if (UNIV_UNLIKELY(!node->in_mysql_interface)) { + /* Copy the necessary columns from clust_rec and calculate the + new values to set */ + row_upd_copy_columns(rec, offsets, index, + UT_LIST_GET_FIRST(node->columns)); + row_upd_eval_new_vals(node->update); + } + + if (!node->is_delete && node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { + err = row_upd_clust_rec( + flags, node, index, offsets, &heap, thr, &mtr); + goto 
exit_func; + } + + if (!row_upd_store_row(node, trx->mysql_thd, thr->prebuilt + ? thr->prebuilt->m_mysql_table : NULL)) { + err = DB_COMPUTE_VALUE_FAILED; + goto exit_func; + } + + if (row_upd_changes_ord_field_binary(index, node->update, thr, + node->row, node->ext)) { + + /* Update causes an ordering field (ordering fields within + the B-tree) of the clustered index record to change: perform + the update by delete marking and inserting. + + TODO! What to do to the 'Halloween problem', where an update + moves the record forward in index so that it is again + updated when the cursor arrives there? Solution: the + read operation must check the undo record undo number when + choosing records to update. MySQL solves now the problem + externally! */ + + err = row_upd_clust_rec_by_insert( + node, index, thr, referenced, +#ifdef WITH_WSREP + foreign, +#endif + &mtr); +all_done: + if (err == DB_SUCCESS) { + node->state = UPD_NODE_UPDATE_ALL_SEC; +success: + node->index = dict_table_get_next_index(index); + } + } else { + err = row_upd_clust_rec( + flags, node, index, offsets, &heap, thr, &mtr); + + if (err == DB_SUCCESS) { + ut_ad(node->is_delete != PLAIN_DELETE); + node->state = node->is_delete + ? UPD_NODE_UPDATE_ALL_SEC + : UPD_NODE_UPDATE_SOME_SEC; + goto success; + } + } + +exit_func: + mtr.commit(); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return err; +} + +/***********************************************************//** +Updates the affected index records of a row. When the control is transferred +to this node, we assume that we have a persistent cursor which was on a +record, and the position of the cursor is stored in the cursor. +@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static +dberr_t +row_upd( +/*====*/ + upd_node_t* node, /*!< in: row update node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err = DB_SUCCESS; + DBUG_ENTER("row_upd"); + + ut_ad(!thr_get_trx(thr)->in_rollback); + + DBUG_PRINT("row_upd", ("table: %s", node->table->name.m_name)); + DBUG_PRINT("row_upd", ("info bits in update vector: 0x%x", + node->update ? node->update->info_bits: 0)); + DBUG_PRINT("row_upd", ("foreign_id: %s", + node->foreign ? 
node->foreign->id: "NULL")); + + if (UNIV_LIKELY(node->in_mysql_interface)) { + + /* We do not get the cmpl_info value from the MySQL + interpreter: we must calculate it on the fly: */ + + if (node->is_delete == PLAIN_DELETE + || row_upd_changes_some_index_ord_field_binary( + node->table, node->update)) { + node->cmpl_info = 0; + } else { + node->cmpl_info = UPD_NODE_NO_ORD_CHANGE; + } + } + + switch (node->state) { + case UPD_NODE_UPDATE_CLUSTERED: + case UPD_NODE_INSERT_CLUSTERED: + log_free_check(); + + err = row_upd_clust_step(node, thr); + + if (err != DB_SUCCESS) { + + DBUG_RETURN(err); + } + } + + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "after_row_upd_clust"); + + if (node->index == NULL + || (!node->is_delete + && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE))) { + + DBUG_RETURN(DB_SUCCESS); + } + + DBUG_EXECUTE_IF("row_upd_skip_sec", node->index = NULL;); + + do { + if (!node->index) { + break; + } + + if (!(node->index->type & (DICT_FTS | DICT_CORRUPT)) + && node->index->is_committed()) { + err = row_upd_sec_step(node, thr); + + if (err != DB_SUCCESS) { + + DBUG_RETURN(err); + } + } + + node->index = dict_table_get_next_index(node->index); + } while (node->index != NULL); + + ut_ad(err == DB_SUCCESS); + + /* Do some cleanup */ + + if (node->row != NULL) { + node->row = NULL; + node->ext = NULL; + node->upd_row = NULL; + node->upd_ext = NULL; + mem_heap_empty(node->heap); + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + DBUG_RETURN(err); +} + +/***********************************************************//** +Updates a row in a table. This is a high-level function used in SQL execution +graphs. +@return query thread to run next or NULL */ +que_thr_t* +row_upd_step( +/*=========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + upd_node_t* node; + sel_node_t* sel_node; + que_node_t* parent; + dberr_t err = DB_SUCCESS; + trx_t* trx; + DBUG_ENTER("row_upd_step"); + + ut_ad(thr); + + trx = thr_get_trx(thr); + + node = static_cast<upd_node_t*>(thr->run_node); + + sel_node = node->select; + + parent = que_node_get_parent(node); + + ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE); + + if (thr->prev_node == parent) { + node->state = UPD_NODE_SET_IX_LOCK; + } + + if (node->state == UPD_NODE_SET_IX_LOCK) { + + if (!node->has_clust_rec_x_lock) { + /* It may be that the current session has not yet + started its transaction, or it has been committed: */ + + err = lock_table(node->table, nullptr, LOCK_IX, thr); + + if (err != DB_SUCCESS) { + + goto error_handling; + } + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + if (node->searched_update) { + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch a row to update */ + + thr->run_node = sel_node; + + DBUG_RETURN(thr); + } + } + + /* sel_node is NULL if we are in the MySQL interface */ + + if (sel_node && (sel_node->state != SEL_NODE_FETCH)) { + + if (!node->searched_update) { + /* An explicit cursor should be positioned on a row + to update */ + + ut_error; + + err = DB_ERROR; + + goto error_handling; + } + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to update, or the select node performed the + updates directly in-place */ + + thr->run_node = parent; + + DBUG_RETURN(thr); + } + + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + + err = row_upd(node, thr); + +error_handling: + trx->error_state = err; + + if (err != DB_SUCCESS) { + DBUG_RETURN(NULL); + } + + /* DO THE TRIGGER ACTIONS HERE */ + + if (node->searched_update) { + /* Fetch next row to update */ + + thr->run_node = 
sel_node; + } else { + /* It was an explicit cursor update */ + + thr->run_node = parent; + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + DBUG_RETURN(thr); +} + +/** Write query start time as SQL field data to a buffer. Needed by InnoDB. +@param thd Thread object +@param buf Buffer to hold start time data */ +void thd_get_query_start_data(THD *thd, char *buf); + +/** Appends row_start or row_end field to update vector and sets a +CURRENT_TIMESTAMP/trx->id value to it. Called by vers_make_update() and +vers_make_delete(). +@param[in] trx transaction +@param[in] vers_sys_idx table->row_start or table->row_end */ +void upd_node_t::vers_update_fields(const trx_t *trx, ulint idx) +{ + ut_ad(in_mysql_interface); // otherwise needs to recalculate node->cmpl_info + ut_ad(idx == table->vers_start || idx == table->vers_end); + + dict_index_t *clust_index= dict_table_get_first_index(table); + const dict_col_t *col= dict_table_get_nth_col(table, idx); + ulint field_no= dict_col_get_clust_pos(col, clust_index); + upd_field_t *ufield; + + for (ulint i= 0; i < update->n_fields; ++i) + { + if (update->fields[i].field_no == field_no) + { + ufield= &update->fields[i]; + goto skip_append; + } + } + + /* row_create_update_node_for_mysql() pre-allocated this much. + At least one PK column always remains unchanged. */ + ut_ad(update->n_fields < ulint(table->n_cols + table->n_v_cols)); + + update->n_fields++; + ufield= upd_get_nth_field(update, update->n_fields - 1); + upd_field_set_field_no(ufield, static_cast<uint16_t>(field_no), clust_index); + +skip_append: + char *where= reinterpret_cast<char *>(update->vers_sys_value); + if (col->vers_native()) + mach_write_to_8(where, trx->id); + else + thd_get_query_start_data(trx->mysql_thd, where); + + dfield_set_data(&ufield->new_val, update->vers_sys_value, col->len); + + for (ulint col_no= 0; col_no < dict_table_get_n_v_cols(table); col_no++) + { + const dict_v_col_t *v_col= dict_table_get_nth_v_col(table, col_no); + if (!v_col->m_col.ord_part) + continue; + for (ulint i= 0; i < unsigned(v_col->num_base); i++) + { + dict_col_t *base_col= v_col->base_col[i]; + if (base_col->ind == col->ind) + { + /* Virtual column depends on system field value + which we updated above. Remove it from update + vector, so it is recalculated in + row_upd_store_v_row() (see !update branch). */ + update->remove(v_col->v_pos); + break; + } + } + } +} + + +/** Prepare update vector for versioned delete. +Set row_end to CURRENT_TIMESTAMP or trx->id. +Initialize fts_next_doc_id for versioned delete. +@param[in] trx transaction */ +void upd_node_t::vers_make_delete(trx_t* trx) +{ + update->n_fields= 0; + is_delete= VERSIONED_DELETE; + vers_update_fields(trx, table->vers_end); + trx->fts_next_doc_id= table->fts ? UINT64_UNDEFINED : 0; +} diff --git a/storage/innobase/row/row0vers.cc b/storage/innobase/row/row0vers.cc new file mode 100644 index 00000000..c3acf325 --- /dev/null +++ b/storage/innobase/row/row0vers.cc @@ -0,0 +1,1419 @@ +/***************************************************************************** + +Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0vers.cc +Row versions + +Created 2/6/1997 Heikki Tuuri +*******************************************************/ + +#include "row0vers.h" +#include "dict0dict.h" +#include "dict0boot.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0upd.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "row0mysql.h" + +/** Check whether all non-virtual index fields are equal. +@param[in] index the secondary index +@param[in] a first index entry to compare +@param[in] b second index entry to compare +@return whether all non-virtual fields are equal */ +static +bool +row_vers_non_virtual_fields_equal( + const dict_index_t* index, + const dfield_t* a, + const dfield_t* b) +{ + const dict_field_t* end = &index->fields[index->n_fields]; + + for (const dict_field_t* ifield = index->fields; ifield != end; + ifield++) { + if (!ifield->col->is_virtual() + && cmp_dfield_dfield(a++, b++)) { + return false; + } + } + + return true; +} + +/** Determine if an active transaction has inserted or modified a secondary +index record. +@param[in,out] caller_trx trx of current thread +@param[in] clust_rec clustered index record +@param[in] clust_index clustered index +@param[in] rec secondary index record +@param[in] index secondary index +@param[in] offsets rec_get_offsets(rec, index) +@param[in,out] mtr mini-transaction +@return the active transaction; state must be rechecked after +acquiring trx->mutex, and trx->release_reference() must be invoked +@retval NULL if the record was committed */ +UNIV_INLINE +trx_t* +row_vers_impl_x_locked_low( + trx_t* caller_trx, + const rec_t* clust_rec, + dict_index_t* clust_index, + const rec_t* rec, + dict_index_t* index, + const rec_offs* offsets, + mtr_t* mtr) +{ + trx_id_t trx_id; + rec_t* prev_version = NULL; + rec_offs clust_offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* clust_offsets; + mem_heap_t* heap; + dtuple_t* ientry = NULL; + mem_heap_t* v_heap = NULL; + dtuple_t* cur_vrow = NULL; + + rec_offs_init(clust_offsets_); + + DBUG_ENTER("row_vers_impl_x_locked_low"); + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr->memo_contains_page_flagged(clust_rec, + MTR_MEMO_PAGE_S_FIX + | MTR_MEMO_PAGE_X_FIX)); + + if (ulint trx_id_offset = clust_index->trx_id_offset) { + trx_id = mach_read_from_6(clust_rec + trx_id_offset); + if (trx_id == 0) { + /* The transaction history was already purged. */ + DBUG_RETURN(0); + } + } + + heap = mem_heap_create(1024); + + clust_offsets = rec_get_offsets(clust_rec, clust_index, clust_offsets_, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets); + if (trx_id == 0) { + /* The transaction history was already purged. 
*/ + mem_heap_free(heap); + DBUG_RETURN(0); + } + + ut_ad(!clust_index->table->is_temporary()); + + trx_t* trx; + + if (trx_id == caller_trx->id) { + trx = caller_trx; + trx->reference(); + } else { + trx = trx_sys.find(caller_trx, trx_id); + if (trx == 0) { + /* The transaction that modified or inserted + clust_rec is no longer active, or it is + corrupt: no implicit lock on rec */ + lock_check_trx_id_sanity(trx_id, clust_rec, + clust_index, clust_offsets); + mem_heap_free(heap); + DBUG_RETURN(0); + } + } + + const ulint comp = page_rec_is_comp(rec); + ut_ad(index->table == clust_index->table); + ut_ad(!!comp == dict_table_is_comp(index->table)); + ut_ad(!comp == !page_rec_is_comp(clust_rec)); + + const ulint rec_del = rec_get_deleted_flag(rec, comp); + + if (dict_index_has_virtual(index)) { + ulint est_size = DTUPLE_EST_ALLOC(index->n_fields); + + /* Allocate the dtuple for virtual columns extracted from undo + log with its own heap, so to avoid it being freed as we + iterating in the version loop below. */ + v_heap = mem_heap_create(est_size); + ientry = row_rec_to_index_entry(rec, index, offsets, v_heap); + } + + /* We look up if some earlier version, which was modified by + the trx_id transaction, of the clustered index record would + require rec to be in a different state (delete marked or + unmarked, or have different field values, or not existing). If + there is such a version, then rec was modified by the trx_id + transaction, and it has an implicit x-lock on rec. Note that + if clust_rec itself would require rec to be in a different + state, then the trx_id transaction has not yet had time to + modify rec, and does not necessarily have an implicit x-lock + on rec. */ + + for (const rec_t* version = clust_rec;; version = prev_version) { + row_ext_t* ext; + dtuple_t* row; + dtuple_t* entry; + ulint vers_del; + trx_id_t prev_trx_id; + mem_heap_t* old_heap = heap; + dtuple_t* vrow = NULL; + + /* We keep the semaphore in mtr on the clust_rec page, so + that no other transaction can update it and get an + implicit x-lock on rec until mtr_commit(mtr). */ + + heap = mem_heap_create(1024); + + trx_undo_prev_version_build( + version, clust_index, clust_offsets, + heap, &prev_version, NULL, + dict_index_has_virtual(index) ? &vrow : NULL, 0); + + ut_d(trx->mutex_lock()); + const bool committed = trx_state_eq( + trx, TRX_STATE_COMMITTED_IN_MEMORY); + ut_d(trx->mutex_unlock()); + + /* The oldest visible clustered index version must not be + delete-marked, because we never start a transaction by + inserting a delete-marked record. */ + ut_ad(committed || prev_version + || !rec_get_deleted_flag(version, comp)); + + /* Free version and clust_offsets. */ + mem_heap_free(old_heap); + + if (committed) { + goto not_locked; + } + + if (prev_version == NULL) { + + /* We reached the oldest visible version without + finding an older version of clust_rec that would + match the secondary index record. If the secondary + index record is not delete marked, then clust_rec + is considered the correct match of the secondary + index record and hence holds the implicit lock. */ + + if (rec_del) { + /* The secondary index record is del marked. + So, the implicit lock holder of clust_rec + did not modify the secondary index record yet, + and is not holding an implicit lock on it. + + This assumes that whenever a row is inserted + or updated, the leaf page record always is + created with a clear delete-mark flag. + (We never insert a delete-marked record.) 
*/ +not_locked: + trx->release_reference(); + trx = 0; + } + + break; + } + + clust_offsets = rec_get_offsets( + prev_version, clust_index, clust_offsets_, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + vers_del = rec_get_deleted_flag(prev_version, comp); + + prev_trx_id = row_get_rec_trx_id(prev_version, clust_index, + clust_offsets); + + /* The stack of versions is locked by mtr. Thus, it + is safe to fetch the prefixes for externally stored + columns. */ + + row = row_build(ROW_COPY_POINTERS, clust_index, prev_version, + clust_offsets, + NULL, NULL, NULL, &ext, heap); + + if (dict_index_has_virtual(index)) { + if (vrow) { + /* Keep the virtual row info for the next + version */ + cur_vrow = dtuple_copy(vrow, v_heap); + dtuple_dup_v_fld(cur_vrow, v_heap); + } + + if (!cur_vrow) { + /* Build index entry out of row */ + entry = row_build_index_entry(row, ext, index, + heap); + + /* entry could only be NULL (the + clustered index record could contain + BLOB pointers that are NULL) if we + were accessing a freshly inserted + record before it was fully inserted. + prev_version cannot possibly be such + an incomplete record, because its + transaction would have to be committed + in order for later versions of the + record to be able to exist. */ + ut_ad(entry); + + /* If the indexed virtual columns has changed, + there must be log record to generate vrow. + Otherwise, it is not changed, so no need + to compare */ + if (!row_vers_non_virtual_fields_equal( + index, + ientry->fields, entry->fields)) { + if (rec_del != vers_del) { + break; + } + } else if (!rec_del) { + break; + } + + goto result_check; + } else { + ut_ad(row->n_v_fields == cur_vrow->n_v_fields); + dtuple_copy_v_fields(row, cur_vrow); + } + } + + entry = row_build_index_entry(row, ext, index, heap); + + /* entry could only be NULL (the clustered index + record could contain BLOB pointers that are NULL) if + we were accessing a freshly inserted record before it + was fully inserted. prev_version cannot possibly be + such an incomplete record, because its transaction + would have to be committed in order for later versions + of the record to be able to exist. */ + ut_ad(entry); + + /* If we get here, we know that the trx_id transaction + modified prev_version. Let us check if prev_version + would require rec to be in a different state. */ + + /* The previous version of clust_rec must be + accessible, because clust_rec was not a fresh insert. + There is no guarantee that the transaction is still + active. */ + + /* We check if entry and rec are identified in the alphabetical + ordering */ + if (0 == cmp_dtuple_rec(entry, rec, index, offsets)) { + /* The delete marks of rec and prev_version should be + equal for rec to be in the state required by + prev_version */ + + if (rec_del != vers_del) { + + break; + } + + /* It is possible that the row was updated so that the + secondary index record remained the same in + alphabetical ordering, but the field values changed + still. For example, 'abc' -> 'ABC'. Check also that. 
*/ + + dtuple_set_types_binary( + entry, dtuple_get_n_fields(entry)); + + if (cmp_dtuple_rec(entry, rec, index, offsets)) { + + break; + } + + } else if (!rec_del) { + /* The delete mark should be set in rec for it to be + in the state required by prev_version */ + + break; + } + +result_check: + if (trx->id != prev_trx_id) { + /* prev_version was the first version modified by + the trx_id transaction: no implicit x-lock */ + goto not_locked; + } + } + + if (trx) { + DBUG_PRINT("info", ("Implicit lock is held by trx:" TRX_ID_FMT, + trx_id)); + } + + if (v_heap != NULL) { + mem_heap_free(v_heap); + } + + mem_heap_free(heap); + DBUG_RETURN(trx); +} + +/** Determine if an active transaction has inserted or modified a secondary +index record. +@param[in,out] caller_trx trx of current thread +@param[in] rec secondary index record +@param[in] index secondary index +@param[in] offsets rec_get_offsets(rec, index) +@return the active transaction; state must be rechecked after +acquiring trx->mutex, and trx->release_reference() must be invoked +@retval NULL if the record was committed */ +trx_t* +row_vers_impl_x_locked( + trx_t* caller_trx, + const rec_t* rec, + dict_index_t* index, + const rec_offs* offsets) +{ + mtr_t mtr; + trx_t* trx; + const rec_t* clust_rec; + dict_index_t* clust_index; + + lock_sys.assert_unlocked(); + + mtr_start(&mtr); + + /* Search for the clustered index record. The latch on the + page of clust_rec locks the top of the stack of versions. The + bottom of the version stack is not locked; oldest versions may + disappear by the fact that transactions may be committed and + collected by the purge. This is not a problem, because we are + only interested in active transactions. */ + + clust_rec = row_get_clust_rec( + BTR_SEARCH_LEAF, rec, index, &clust_index, &mtr); + + if (!clust_rec) { + /* In a rare case it is possible that no clust rec is found + for a secondary index record: if in row0umod.cc + row_undo_mod_remove_clust_low() we have already removed the + clust rec, while purge is still cleaning and removing + secondary index records associated with earlier versions of + the clustered index record. In that case there cannot be + any implicit lock on the secondary index record, because + an active transaction which has modified the secondary index + record has also modified the clustered index record. And in + a rollback we always undo the modifications to secondary index + records before the clustered index record. */ + + trx = 0; + } else { + trx = row_vers_impl_x_locked_low( + caller_trx, clust_rec, clust_index, rec, index, + offsets, &mtr); + + ut_ad(trx == 0 || trx->is_referenced()); + } + + mtr_commit(&mtr); + + return(trx); +} + +/** build virtual column value from current cluster index record data +@param[in,out] row the cluster index row in dtuple form +@param[in] clust_index clustered index +@param[in] index the secondary index +@param[in] heap heap used to build virtual dtuple. 
*/ +static +bool +row_vers_build_clust_v_col( + dtuple_t* row, + dict_index_t* clust_index, + dict_index_t* index, + mem_heap_t* heap) +{ + THD* thd= current_thd; + TABLE* maria_table= 0; + + ut_ad(dict_index_has_virtual(index)); + ut_ad(index->table == clust_index->table); + + DEBUG_SYNC(current_thd, "ib_clust_v_col_before_row_allocated"); + + ib_vcol_row vc(nullptr); + byte *record = vc.record(thd, index, &maria_table); + + ut_ad(maria_table); + + for (ulint i = 0; i < dict_index_get_n_fields(index); i++) { + const dict_col_t* c = dict_index_get_nth_col(index, i); + + if (c->is_virtual()) { + const dict_v_col_t* col + = reinterpret_cast<const dict_v_col_t*>(c); + + dfield_t *vfield = innobase_get_computed_value( + row, col, clust_index, &vc.heap, + heap, NULL, thd, maria_table, record, NULL, + NULL); + if (!vfield) { + innobase_report_computed_value_failed(row); + ut_ad(0); + return false; + } + } + } + + return true; +} + +/** Build latest virtual column data from undo log +@param[in] in_purge whether this is the purge thread +@param[in] rec clustered index record +@param[in] clust_index clustered index +@param[in,out] clust_offsets offsets on the clustered index record +@param[in] index the secondary index +@param[in] roll_ptr the rollback pointer for the purging record +@param[in] trx_id trx id for the purging record +@param[in,out] v_heap heap used to build vrow +@param[out] v_row dtuple holding the virtual rows +@param[in,out] mtr mtr holding the latch on rec */ +static +void +row_vers_build_cur_vrow_low( + bool in_purge, + const rec_t* rec, + dict_index_t* clust_index, + rec_offs* clust_offsets, + dict_index_t* index, + roll_ptr_t roll_ptr, + trx_id_t trx_id, + mem_heap_t* v_heap, + dtuple_t** vrow, + mtr_t* mtr) +{ + const rec_t* version; + rec_t* prev_version; + mem_heap_t* heap = NULL; + ulint num_v = dict_table_get_n_v_cols(index->table); + const dfield_t* field; + ulint i; + bool all_filled = false; + + *vrow = dtuple_create_with_vcol(v_heap, 0, num_v); + dtuple_init_v_fld(*vrow); + + for (i = 0; i < num_v; i++) { + dfield_get_type(dtuple_get_nth_v_field(*vrow, i))->mtype + = DATA_MISSING; + } + + ut_ad(mtr->memo_contains_page_flagged(rec, + MTR_MEMO_PAGE_S_FIX + | MTR_MEMO_PAGE_X_FIX)); + + version = rec; + + /* If this is called by purge thread, set TRX_UNDO_PREV_IN_PURGE + bit to search the undo log until we hit the current undo log with + roll_ptr */ + const ulint status = in_purge + ? 
TRX_UNDO_PREV_IN_PURGE | TRX_UNDO_GET_OLD_V_VALUE + : TRX_UNDO_GET_OLD_V_VALUE; + + while (!all_filled) { + mem_heap_t* heap2 = heap; + heap = mem_heap_create(1024); + roll_ptr_t cur_roll_ptr = row_get_rec_roll_ptr( + version, clust_index, clust_offsets); + + trx_undo_prev_version_build( + version, clust_index, clust_offsets, + heap, &prev_version, NULL, vrow, status); + + if (heap2) { + mem_heap_free(heap2); + } + + if (!prev_version) { + /* Versions end here */ + break; + } + + clust_offsets = rec_get_offsets(prev_version, clust_index, + NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + ulint entry_len = dict_index_get_n_fields(index); + + all_filled = true; + + for (i = 0; i < entry_len; i++) { + const dict_col_t* col + = dict_index_get_nth_col(index, i); + + if (!col->is_virtual()) { + continue; + } + + const dict_v_col_t* v_col + = reinterpret_cast<const dict_v_col_t*>(col); + field = dtuple_get_nth_v_field(*vrow, v_col->v_pos); + + if (dfield_get_type(field)->mtype == DATA_MISSING) { + all_filled = false; + break; + } + + } + + trx_id_t rec_trx_id = row_get_rec_trx_id( + prev_version, clust_index, clust_offsets); + + if (rec_trx_id < trx_id || roll_ptr == cur_roll_ptr) { + break; + } + + version = prev_version; + } + + mem_heap_free(heap); +} + +/** Check a virtual column value index secondary virtual index matches +that of current cluster index record, which is recreated from information +stored in undo log +@param[in] rec record in the clustered index +@param[in] icentry the index entry built from a cluster row +@param[in] clust_index cluster index +@param[in] clust_offsets offsets on the cluster record +@param[in] index the secondary index +@param[in] ientry the secondary index entry +@param[in] roll_ptr the rollback pointer for the purging record +@param[in] trx_id trx id for the purging record +@param[in,out] v_heap heap used to build virtual dtuple +@param[in,out] v_row dtuple holding the virtual rows (if needed) +@param[in] mtr mtr holding the latch on rec +@return true if matches, false otherwise */ +static +bool +row_vers_vc_matches_cluster( + const rec_t* rec, + const dtuple_t* icentry, + dict_index_t* clust_index, + rec_offs* clust_offsets, + dict_index_t* index, + const dtuple_t* ientry, + roll_ptr_t roll_ptr, + trx_id_t trx_id, + mem_heap_t* v_heap, + dtuple_t** vrow, + mtr_t* mtr) +{ + const rec_t* version; + rec_t* prev_version; + mem_heap_t* heap2; + mem_heap_t* heap = NULL; + mem_heap_t* tuple_heap; + ulint num_v = dict_table_get_n_v_cols(index->table); + bool compare[REC_MAX_N_FIELDS]; + ulint n_fields = dtuple_get_n_fields(ientry); + ulint n_non_v_col = 0; + ulint n_cmp_v_col = 0; + const dfield_t* field1; + dfield_t* field2; + ulint i; + + /* First compare non-virtual columns (primary keys) */ + ut_ad(index->n_fields == n_fields); + ut_ad(n_fields == dtuple_get_n_fields(icentry)); + ut_ad(mtr->memo_contains_page_flagged(rec, + MTR_MEMO_PAGE_S_FIX + | MTR_MEMO_PAGE_X_FIX)); + + { + const dfield_t* a = ientry->fields; + const dfield_t* b = icentry->fields; + + for (const dict_field_t *ifield = index->fields, + *const end = &index->fields[index->n_fields]; + ifield != end; ifield++, a++, b++) { + if (!ifield->col->is_virtual()) { + if (cmp_dfield_dfield(a, b)) { + return false; + } + n_non_v_col++; + } + } + } + + tuple_heap = mem_heap_create(1024); + + ut_ad(n_fields > n_non_v_col); + + *vrow = dtuple_create_with_vcol(v_heap ? 
v_heap : tuple_heap, 0, num_v); + dtuple_init_v_fld(*vrow); + + for (i = 0; i < num_v; i++) { + dfield_get_type(dtuple_get_nth_v_field(*vrow, i))->mtype + = DATA_MISSING; + compare[i] = false; + } + + version = rec; + + while (n_cmp_v_col < n_fields - n_non_v_col) { + heap2 = heap; + heap = mem_heap_create(1024); + roll_ptr_t cur_roll_ptr = row_get_rec_roll_ptr( + version, clust_index, clust_offsets); + + ut_ad(cur_roll_ptr != 0); + ut_ad(roll_ptr != 0); + + trx_undo_prev_version_build( + version, clust_index, clust_offsets, + heap, &prev_version, NULL, vrow, + TRX_UNDO_PREV_IN_PURGE | TRX_UNDO_GET_OLD_V_VALUE); + + if (heap2) { + mem_heap_free(heap2); + } + + if (!prev_version) { + /* Versions end here */ + goto func_exit; + } + + clust_offsets = rec_get_offsets(prev_version, clust_index, + NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + ulint entry_len = dict_index_get_n_fields(index); + + for (i = 0; i < entry_len; i++) { + const dict_field_t* ind_field + = dict_index_get_nth_field(index, i); + const dict_col_t* col = ind_field->col; + field1 = dtuple_get_nth_field(ientry, i); + + if (!col->is_virtual()) { + continue; + } + + const dict_v_col_t* v_col + = reinterpret_cast<const dict_v_col_t*>(col); + field2 + = dtuple_get_nth_v_field(*vrow, v_col->v_pos); + + if ((dfield_get_type(field2)->mtype != DATA_MISSING) + && (!compare[v_col->v_pos])) { + + if (ind_field->prefix_len != 0 + && !dfield_is_null(field2)) { + field2->len = unsigned( + dtype_get_at_most_n_mbchars( + field2->type.prtype, + field2->type.mbminlen, + field2->type.mbmaxlen, + ind_field->prefix_len, + field2->len, + static_cast<char*> + (field2->data))); + } + + /* The index field mismatch */ + if (v_heap + || cmp_dfield_dfield(field2, field1)) { + if (v_heap) { + dtuple_dup_v_fld(*vrow, v_heap); + } + + mem_heap_free(tuple_heap); + mem_heap_free(heap); + return(false); + } + + compare[v_col->v_pos] = true; + n_cmp_v_col++; + } + } + + trx_id_t rec_trx_id = row_get_rec_trx_id( + prev_version, clust_index, clust_offsets); + + if (rec_trx_id < trx_id || roll_ptr == cur_roll_ptr) { + break; + } + + version = prev_version; + } + +func_exit: + if (n_cmp_v_col == 0) { + *vrow = NULL; + } + + mem_heap_free(tuple_heap); + mem_heap_free(heap); + + /* FIXME: In the case of n_cmp_v_col is not the same as + n_fields - n_non_v_col, callback is needed to compare the rest + columns. 
At the timebeing, we will need to return true */ + return (true); +} + +/** Build a dtuple contains virtual column data for current cluster index +@param[in] in_purge called by purge thread +@param[in] rec cluster index rec +@param[in] clust_index cluster index +@param[in] clust_offsets cluster rec offset +@param[in] index secondary index +@param[in] roll_ptr roll_ptr for the purge record +@param[in] trx_id transaction ID on the purging record +@param[in,out] heap heap memory +@param[in,out] v_heap heap memory to keep virtual colum dtuple +@param[in] mtr mtr holding the latch on rec +@return dtuple contains virtual column data */ +static +dtuple_t* +row_vers_build_cur_vrow( + bool in_purge, + const rec_t* rec, + dict_index_t* clust_index, + rec_offs** clust_offsets, + dict_index_t* index, + roll_ptr_t roll_ptr, + trx_id_t trx_id, + mem_heap_t* heap, + mem_heap_t* v_heap, + mtr_t* mtr) +{ + dtuple_t* cur_vrow = NULL; + + roll_ptr_t t_roll_ptr = row_get_rec_roll_ptr( + rec, clust_index, *clust_offsets); + + /* if the row is newly inserted, then the virtual + columns need to be computed */ + if (trx_undo_roll_ptr_is_insert(t_roll_ptr)) { + + ut_ad(!rec_get_deleted_flag(rec, page_rec_is_comp(rec))); + + /* This is a newly inserted record and cannot + be deleted, So the externally stored field + cannot be freed yet. */ + dtuple_t* row = row_build(ROW_COPY_POINTERS, clust_index, + rec, *clust_offsets, + NULL, NULL, NULL, NULL, heap); + + if (!row_vers_build_clust_v_col(row, clust_index, index, + heap)) { + return nullptr; + } + + cur_vrow = dtuple_copy(row, v_heap); + dtuple_dup_v_fld(cur_vrow, v_heap); + } else { + /* Try to fetch virtual column data from undo log */ + row_vers_build_cur_vrow_low( + in_purge, rec, clust_index, *clust_offsets, + index, roll_ptr, trx_id, v_heap, &cur_vrow, mtr); + } + + *clust_offsets = rec_get_offsets(rec, clust_index, NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + return(cur_vrow); +} + +/** @return whether two data tuples are equal */ +static bool dtuple_coll_eq(const dtuple_t &tuple1, const dtuple_t &tuple2) +{ + ut_ad(tuple1.magic_n == DATA_TUPLE_MAGIC_N); + ut_ad(tuple2.magic_n == DATA_TUPLE_MAGIC_N); + ut_ad(dtuple_check_typed(&tuple1)); + ut_ad(dtuple_check_typed(&tuple2)); + ut_ad(tuple1.n_fields == tuple2.n_fields); + + for (ulint i= 0; i < tuple1.n_fields; i++) + if (cmp_dfield_dfield(&tuple1.fields[i], &tuple2.fields[i])) + return false; + return true; +} + +/** Find out whether data tuple has missing data type +for indexed virtual column. +@param tuple data tuple +@param index virtual index +@return true if tuple has missing column type */ +static bool dtuple_vcol_data_missing(const dtuple_t &tuple, + dict_index_t *index) +{ + for (ulint i= 0; i < index->n_uniq; i++) + { + dict_col_t *col= index->fields[i].col; + if (!col->is_virtual()) + continue; + dict_v_col_t *vcol= reinterpret_cast<dict_v_col_t*>(col); + for (ulint j= 0; j < index->table->n_v_cols; j++) + { + if (vcol == &index->table->v_cols[j] + && tuple.v_fields[j].type.mtype == DATA_MISSING) + return true; + } + } + return false; +} + +/** Finds out if a version of the record, where the version >= the current +purge_sys.view, should have ientry as its secondary index entry. We check +if there is any not delete marked version of the record where the trx +id >= purge view, and the secondary index entry == ientry; exactly in +this case we return TRUE. 
+@param[in] also_curr TRUE if also rec is included in the versions + to search; otherwise only versions prior + to it are searched +@param[in] rec record in the clustered index; the caller + must have a latch on the page +@param[in] mtr mtr holding the latch on rec; it will + also hold the latch on purge_view +@param[in] index secondary index +@param[in] ientry secondary index entry +@param[in] roll_ptr roll_ptr for the purge record +@param[in] trx_id transaction ID on the purging record +@return TRUE if earlier version should have */ +bool +row_vers_old_has_index_entry( + bool also_curr, + const rec_t* rec, + mtr_t* mtr, + dict_index_t* index, + const dtuple_t* ientry, + roll_ptr_t roll_ptr, + trx_id_t trx_id) +{ + const rec_t* version; + rec_t* prev_version; + dict_index_t* clust_index; + rec_offs* clust_offsets; + mem_heap_t* heap; + mem_heap_t* heap2; + dtuple_t* row; + const dtuple_t* entry; + ulint comp; + dtuple_t* vrow = NULL; + mem_heap_t* v_heap = NULL; + dtuple_t* cur_vrow = NULL; + + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + clust_index = dict_table_get_first_index(index->table); + + comp = page_rec_is_comp(rec); + ut_ad(!dict_table_is_comp(index->table) == !comp); + heap = mem_heap_create(1024); + clust_offsets = rec_get_offsets(rec, clust_index, NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (dict_index_has_virtual(index)) { + v_heap = mem_heap_create(100); + } + + DBUG_EXECUTE_IF("ib_purge_virtual_index_crash", + DBUG_SUICIDE();); + + if (also_curr && !rec_get_deleted_flag(rec, comp)) { + row_ext_t* ext; + + /* The top of the stack of versions is locked by the + mtr holding a latch on the page containing the + clustered index record. The bottom of the stack is + locked by the fact that the purge_sys.view must + 'overtake' any read view of an active transaction. + Thus, it is safe to fetch the prefixes for + externally stored columns. */ + row = row_build(ROW_COPY_POINTERS, clust_index, + rec, clust_offsets, + NULL, NULL, NULL, &ext, heap); + + if (dict_index_has_virtual(index)) { + + +#ifdef DBUG_OFF +# define dbug_v_purge false +#else /* DBUG_OFF */ + bool dbug_v_purge = false; +#endif /* DBUG_OFF */ + + DBUG_EXECUTE_IF( + "ib_purge_virtual_index_callback", + dbug_v_purge = true;); + + roll_ptr_t t_roll_ptr = row_get_rec_roll_ptr( + rec, clust_index, clust_offsets); + + /* if the row is newly inserted, then the virtual + columns need to be computed */ + if (trx_undo_roll_ptr_is_insert(t_roll_ptr) + || dbug_v_purge) { + + if (!row_vers_build_clust_v_col( + row, clust_index, index, heap)) { + goto unsafe_to_purge; + } + + entry = row_build_index_entry( + row, ext, index, heap); + if (entry && dtuple_coll_eq(*ientry, *entry)) { + goto unsafe_to_purge; + } + } else { + /* Build index entry out of row */ + entry = row_build_index_entry(row, ext, index, heap); + /* entry could only be NULL if + the clustered index record is an uncommitted + inserted record whose BLOBs have not been + written yet. The secondary index record + can be safely removed, because it cannot + possibly refer to this incomplete + clustered index record. (Insert would + always first be completed for the + clustered index record, then proceed to + secondary indexes.) 
*/
+
+ if (entry && row_vers_vc_matches_cluster(
+ rec, entry,
+ clust_index, clust_offsets,
+ index, ientry, roll_ptr,
+ trx_id, NULL, &vrow, mtr)) {
+ goto unsafe_to_purge;
+ }
+ }
+ clust_offsets = rec_get_offsets(rec, clust_index, NULL,
+ clust_index
+ ->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ } else {
+
+ entry = row_build_index_entry(
+ row, ext, index, heap);
+
+ /* If entry == NULL, the record contains unset BLOB
+ pointers. This must be a freshly inserted record. If
+ this is called from
+ row_purge_remove_sec_if_poss_low(), the thread will
+ hold latches on the clustered index and the secondary
+ index. Because the insert works in three steps:
+
+ (1) insert the record into the clustered index
+ (2) store the BLOBs and update BLOB pointers
+ (3) insert records into the secondary indexes
+
+ the purge thread can safely ignore freshly inserted
+ records and delete the secondary index record. The
+ thread that inserted the new record will be inserting
+ the secondary index records. */
+
+ /* NOTE that we cannot do the comparison as binary
+ fields because the row may be being modified so that
+ the clustered index record has already been updated to
+ a different binary value in a char field, but the
+ collation identifies the old and new value anyway! */
+ if (entry && dtuple_coll_eq(*ientry, *entry)) {
+unsafe_to_purge:
+ mem_heap_free(heap);
+
+ if (v_heap) {
+ mem_heap_free(v_heap);
+ }
+ return true;
+ }
+ }
+ } else if (dict_index_has_virtual(index)) {
+ /* The current clustered index record could be
+ deleted, but its previous version might not be. We will
+ need to get the virtual column data from the undo record
+ associated with the current clustered index record. */
+
+ cur_vrow = row_vers_build_cur_vrow(
+ also_curr, rec, clust_index, &clust_offsets,
+ index, roll_ptr, trx_id, heap, v_heap, mtr);
+ }
+
+ version = rec;
+
+ for (;;) {
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+ vrow = NULL;
+
+ trx_undo_prev_version_build(version,
+ clust_index, clust_offsets,
+ heap, &prev_version, nullptr,
+ dict_index_has_virtual(index)
+ ? &vrow : nullptr,
+ TRX_UNDO_CHECK_PURGEABILITY);
+ mem_heap_free(heap2); /* free version and clust_offsets */
+
+ if (!prev_version) {
+ /* Versions end here */
+ mem_heap_free(heap);
+
+ if (v_heap) {
+ mem_heap_free(v_heap);
+ }
+
+ return false;
+ }
+
+ clust_offsets = rec_get_offsets(prev_version, clust_index,
+ NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (dict_index_has_virtual(index)) {
+ if (vrow) {
+ if (dtuple_vcol_data_missing(*vrow, index)) {
+ goto nochange_index;
+ }
+ /* Keep the virtual row info for the next
+ version, unless it is changed */
+ mem_heap_empty(v_heap);
+ cur_vrow = dtuple_copy(vrow, v_heap);
+ dtuple_dup_v_fld(cur_vrow, v_heap);
+ }
+
+ if (!cur_vrow) {
+ /* Nothing for this index has changed,
+ continue */
+nochange_index:
+ version = prev_version;
+ continue;
+ }
+ }
+
+ if (!rec_get_deleted_flag(prev_version, comp)) {
+ row_ext_t* ext;
+
+ /* The stack of versions is locked by mtr.
+ Thus, it is safe to fetch the prefixes for
+ externally stored columns. */
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ prev_version, clust_offsets,
+ NULL, NULL, NULL, &ext, heap);
+
+ if (dict_index_has_virtual(index)) {
+ ut_ad(cur_vrow);
+ ut_ad(row->n_v_fields == cur_vrow->n_v_fields);
+ dtuple_copy_v_fields(row, cur_vrow);
+ }
+
+ entry = row_build_index_entry(row, ext, index, heap);
+
+ /* If entry == NULL, the record contains unset
+ BLOB pointers.
This must be a freshly
+ inserted record that we can safely ignore.
+ For the justification, see the comments after
+ the previous row_build_index_entry() call. */
+
+ /* NOTE that we cannot do the comparison as binary
+ fields because the secondary index record may already
+ have been updated to a different binary value in
+ a char field, but the collation identifies the old
+ and new value anyway! */
+
+ if (entry && dtuple_coll_eq(*ientry, *entry)) {
+ goto unsafe_to_purge;
+ }
+ }
+
+ version = prev_version;
+ }
+}
+
+/*****************************************************************//**
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
+@return error code
+@retval DB_SUCCESS if a previous version was fetched
+@retval DB_MISSING_HISTORY if the history is missing (a sign of corruption) */
+dberr_t
+row_vers_build_for_consistent_read(
+/*===============================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ rec_offs** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ ReadView* view, /*!< in: the consistent read view */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ rec_t** old_vers,/*!< out, own: old version, or NULL
+ if the history is missing or the record
+ does not exist in the view, that is,
+ it was freshly inserted afterwards */
+ dtuple_t** vrow) /*!< out: virtual row */
+{
+ const rec_t* version;
+ rec_t* prev_version;
+ trx_id_t trx_id;
+ mem_heap_t* heap = NULL;
+ byte* buf;
+ dberr_t err;
+
+ ut_ad(index->is_primary());
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ trx_id = row_get_rec_trx_id(rec, index, *offsets);
+
+ ut_ad(!view->changes_visible(trx_id));
+
+ ut_ad(!vrow || !(*vrow));
+
+ version = rec;
+
+ for (;;) {
+ mem_heap_t* prev_heap = heap;
+
+ heap = mem_heap_create(1024);
+
+ if (vrow) {
+ *vrow = NULL;
+ }
+
+ /* If purge can't see the record, then we can't rely on
+ the UNDO log record.
*/
+
+ err = trx_undo_prev_version_build(
+ version, index, *offsets, heap,
+ &prev_version, NULL, vrow, 0);
+
+ if (prev_heap != NULL) {
+ mem_heap_free(prev_heap);
+ }
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version */
+ *old_vers = NULL;
+ ut_ad(!vrow || !(*vrow));
+ break;
+ }
+
+ *offsets = rec_get_offsets(
+ prev_version, index, *offsets,
+ index->n_core_fields, ULINT_UNDEFINED, offset_heap);
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(prev_version, *offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ trx_id = row_get_rec_trx_id(prev_version, index, *offsets);
+
+ if (view->changes_visible(trx_id)) {
+
+ /* The view already sees this version: we can copy
+ it to in_heap and return */
+
+ buf = static_cast<byte*>(
+ mem_heap_alloc(
+ in_heap, rec_offs_size(*offsets)));
+
+ *old_vers = rec_copy(buf, prev_version, *offsets);
+ rec_offs_make_valid(*old_vers, index, true, *offsets);
+
+ if (vrow && *vrow) {
+ *vrow = dtuple_copy(*vrow, in_heap);
+ dtuple_dup_v_fld(*vrow, in_heap);
+ }
+ break;
+ } else if (trx_id >= view->low_limit_id()
+ && trx_id >= trx_sys.get_max_trx_id()) {
+ err = DB_CORRUPTION;
+ break;
+ }
+ version = prev_version;
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__
+/* Avoid GCC 4.8.5 internal compiler error "could not split insn". */
+# pragma GCC optimize ("O0")
+#endif
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read. */
+void
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+ trx_t* caller_trx,/*!< in/out: trx of current thread */
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ rec_offs** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ const rec_t** old_vers,/*!< out: rec, old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+ dtuple_t** vrow) /*!< out: virtual row, old version, or NULL
+ if it is not updated in the view */
+{
+ const rec_t* version;
+ mem_heap_t* heap = NULL;
+ byte* buf;
+ trx_id_t rec_trx_id = 0;
+
+ ut_ad(index->is_primary());
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ version = rec;
+ ut_ad(!vrow || !(*vrow));
+
+ for (;;) {
+ mem_heap_t* heap2;
+ rec_t* prev_version;
+ trx_id_t version_trx_id;
+
+ version_trx_id = row_get_rec_trx_id(version, index, *offsets);
+ if (rec == version) {
+ rec_trx_id = version_trx_id;
+ }
+
+ if (!trx_sys.is_registered(caller_trx, version_trx_id)) {
+committed_version_trx:
+ /* We found a version that belongs to a
+ committed transaction: return it.
*/ + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(version, *offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + if (rec == version) { + *old_vers = rec; + if (vrow) { + *vrow = NULL; + } + break; + } + + /* We assume that a rolled-back transaction stays in + TRX_STATE_ACTIVE state until all the changes have been + rolled back and the transaction is removed from + the global list of transactions. */ + + if (rec_trx_id == version_trx_id) { + /* The transaction was committed while + we searched for earlier versions. + Return the current version as a + semi-consistent read. */ + + version = rec; + *offsets = rec_get_offsets( + version, index, *offsets, + index->n_core_fields, ULINT_UNDEFINED, + offset_heap); + } + + buf = static_cast<byte*>( + mem_heap_alloc( + in_heap, rec_offs_size(*offsets))); + + *old_vers = rec_copy(buf, version, *offsets); + rec_offs_make_valid(*old_vers, index, true, *offsets); + if (vrow && *vrow) { + *vrow = dtuple_copy(*vrow, in_heap); + dtuple_dup_v_fld(*vrow, in_heap); + } + break; + } + + DEBUG_SYNC_C("after_row_vers_check_trx_active"); + + heap2 = heap; + heap = mem_heap_create(1024); + + if (trx_undo_prev_version_build(version, index, *offsets, heap, + &prev_version, in_heap, vrow, + 0) != DB_SUCCESS) { + mem_heap_free(heap); + heap = heap2; + heap2 = NULL; + goto committed_version_trx; + } + + if (heap2) { + mem_heap_free(heap2); /* free version */ + } + + if (prev_version == NULL) { + /* It was a freshly inserted version */ + *old_vers = NULL; + ut_ad(!vrow || !(*vrow)); + break; + } + + version = prev_version; + *offsets = rec_get_offsets(version, index, *offsets, + index->n_core_fields, + ULINT_UNDEFINED, offset_heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(version, *offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + }/* for (;;) */ + + if (heap) { + mem_heap_free(heap); + } +} |
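
Editor's note: the loop in row_vers_build_for_consistent_read() above walks the version chain of a clustered index record backwards (newest to oldest) until it finds a version whose writer is visible to the read view. The following is a minimal, self-contained C++ sketch of that walk, not InnoDB code: the types RecordVersion and ReadView, the single up_limit_id visibility rule, and build_for_consistent_read() are hypothetical simplifications (the real ReadView::changes_visible() also consults the list of active transaction ids, and the real code rebuilds each older version from the undo log with trx_undo_prev_version_build() instead of iterating a vector).

#include <cstdint>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

struct RecordVersion {
    uint64_t    trx_id;   // transaction id that created this version
    std::string payload;  // stand-in for the clustered index column data
};

struct ReadView {
    uint64_t up_limit_id; // simplification: changes by trx_id < this are visible
    bool changes_visible(uint64_t trx_id) const { return trx_id < up_limit_id; }
};

// chain[0] is the record as currently stored in the clustered index; each
// following element stands for the older version that the real code would
// rebuild from the undo log.
std::optional<RecordVersion>
build_for_consistent_read(const std::vector<RecordVersion>& chain,
                          const ReadView& view)
{
    for (const RecordVersion& version : chain) {
        if (view.changes_visible(version.trx_id)) {
            // The real function copies the version into in_heap (rec_copy);
            // here we simply return it by value.
            return version;
        }
    }
    // History exhausted: the record was inserted after the view was created.
    return std::nullopt;
}

int main()
{
    std::vector<RecordVersion> chain = {
        {50, "newest"}, {42, "middle"}, {10, "oldest"}};
    ReadView view{40}; // sees only changes by transactions with id < 40

    if (auto v = build_for_consistent_read(chain, view)) {
        std::cout << "visible version: " << v->payload << '\n'; // prints "oldest"
    } else {
        std::cout << "record not visible in this view\n";
    }
    return 0;
}

The same walk-backwards pattern underlies row_vers_old_has_index_entry() (stop when a non-delete-marked version matches the secondary index entry) and row_vers_build_for_semi_consistent_read() (stop at the first version written by a committed transaction).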